Rizin
unix-like reverse engineering framework and cli tools
regcomp.c
Go to the documentation of this file.
1 /* $OpenBSD: regcomp.c,v 1.20 2010/11/21 00:02:30 tedu Exp $ */
2 /*-
3  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4  * Copyright (c) 1992, 1993, 1994
5  * The Regents of the University of California. All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Henry Spencer.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  * may be used to endorse or promote products derived from this software
20  * without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
35  */
36 
37 #include <sys/types.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <limits.h>
42 #include <stdlib.h>
43 #include "rz_regex.h"
44 #include "rz_util/rz_str.h"
45 #include "rz_util/rz_assert.h"
46 
47 #include "utils.h"
48 #include "regex2.h"
49 
50 #include "cclass.h"
51 #include "cname.h"
52 
53 /*
54  * parse structure, passed up and down to avoid global variables and
55  * other clumsinesses
56  */
57 struct parse {
58  char *next; /* next character in RE */
59  char *end; /* end of string (-> NUL normally) */
60  int error; /* has an error been seen? */
61  sop *strip; /* malloced strip */
62  sopno ssize; /* malloced strip size (allocated) */
63  sopno slen; /* malloced strip length (used) */
64  int ncsalloc; /* number of csets allocated */
65  struct re_guts *g;
66 #define NPAREN 10 /* we need to remember () 1-9 for back refs */
67  sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
68  sopno pend[NPAREN]; /* -> ) ([0] unused) */
69 };
70 
71 static void p_ere(struct parse *, int);
72 static void p_ere_exp(struct parse *);
73 static void p_str(struct parse *);
74 static void p_bre(struct parse *, int, int);
75 static int p_simp_re(struct parse *, int);
76 static int p_count(struct parse *);
77 static void p_bracket(struct parse *);
78 static void p_b_term(struct parse *, cset *);
79 static void p_b_cclass(struct parse *, cset *);
80 static void p_b_eclass(struct parse *, cset *);
81 static char p_b_symbol(struct parse *);
82 static char p_b_coll_elem(struct parse *, int);
83 static char othercase(int);
84 static void bothcases(struct parse *, int);
85 static void ordinary(struct parse *, int);
86 static void special(struct parse *, int);
87 static void nonnewline(struct parse *);
88 static void repeat(struct parse *, sopno, int, int);
89 static int seterr(struct parse *, int);
90 static cset *allocset(struct parse *);
91 static void freeset(struct parse *, cset *);
92 static int freezeset(struct parse *, cset *);
93 static int firstch(struct parse *, cset *);
94 static int nch(struct parse *, cset *);
95 static void mcadd(struct parse *, cset *, char *);
96 static void mcinvert(struct parse *, cset *);
97 static void mccase(struct parse *, cset *);
98 static int isinsets(struct re_guts *, int);
99 static int samesets(struct re_guts *, int, int);
100 static void categorize(struct parse *, struct re_guts *);
101 static sopno dupl(struct parse *, sopno, sopno);
102 static void doemit(struct parse *, sop, size_t);
103 static void doinsert(struct parse *, sop, size_t, sopno);
104 static void dofwd(struct parse *, sopno, sop);
105 static void enlarge(struct parse *, sopno);
106 static void stripsnug(struct parse *, struct re_guts *);
107 static void findmust(struct parse *, struct re_guts *);
108 static sopno pluscount(struct parse *, struct re_guts *);
109 
110 static char nuls[10]; /* place to point scanner in event of error */
111 
112 /*
113  * macros for use with parse structure
114  * BEWARE: these know that the parse structure is named `p' !!!
115  */
116 #define PEEK() (*p->next)
117 #define PEEK2() (*(p->next + 1))
118 #define MORE() (p->next < p->end)
119 #define MORE2() (p->next + 1 < p->end)
120 #define SEE(c) (MORE() && PEEK() == (c))
121 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
122 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
123 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
124 #define NEXT() (p->next++)
125 #define NEXT2() (p->next += 2)
126 #define NEXTn(n) (p->next += (n))
127 #define GETNEXT() (*p->next++)
128 #define SETERROR(e) seterr(p, (e))
129 #define REQUIRE(co, e) (void)((co) || SETERROR(e))
130 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
131 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
132 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
133 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
134 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE() - (pos) + 1, pos)
135 #define AHEAD(pos) dofwd(p, pos, HERE() - (pos))
136 #define ASTERN(sop, pos) EMIT(sop, HERE() - (pos))
137 #define HERE() (p->slen)
138 #define THERE() (p->slen - 1)
139 #define THERETHERE() (p->slen - 2)
140 #define DROP(n) (p->slen -= (n))
141 
142 RZ_API int rz_regex_match(const char *pattern, const char *flags, const char *text) {
143  int ret;
144  RzRegex rx;
145  int re_flags = rz_regex_flags(flags);
146  if (rz_regex_comp(&rx, pattern, re_flags)) {
147  eprintf("FAIL TO COMPILE %s\n", pattern);
148  return 0;
149  }
150  ret = rz_regex_exec(&rx, text, 0, 0, re_flags);
151  rz_regex_fini(&rx);
152  return ret ? 0 : 1;
153 }
154 
155 RZ_API RzList *rz_regex_get_match_list(const char *pattern, const char *flags, const char *text) {
157  RzRegex rx;
159  char *entry;
160  size_t entry_len = 0;
161  int re_flags = rz_regex_flags(flags);
162  if (rz_regex_comp(&rx, pattern, re_flags)) {
163  eprintf("Failed to compile regexp: %s\n", pattern);
164  return NULL;
165  }
166 
167  /* Initialize the boundaries for RZ_REGEX_STARTEND */
168  match.rm_so = 0;
169  match.rm_eo = strlen(text);
170  while (!rz_regex_exec(&rx, text, 1, &match, re_flags | RZ_REGEX_STARTEND)) {
171  entry_len = match.rm_eo - match.rm_so + 1;
172  entry = RZ_NEWS0(char, entry_len);
173  rz_str_ncpy(entry, text + match.rm_so, entry_len);
175  /* Update the boundaries for RZ_REGEX_STARTEND */
176  match.rm_so = match.rm_eo;
177  match.rm_eo = strlen(text);
178  }
179  rz_regex_fini(&rx);
180  return list;
181 }
182 
183 RZ_API RzRegex *rz_regex_new(const char *pattern, const char *flags) {
184  rz_return_val_if_fail(pattern, NULL);
185  RzRegex *r, rx = { 0 };
186  if (rz_regex_comp(&rx, pattern, rz_regex_flags(flags))) {
187  return NULL;
188  }
189  r = RZ_NEW(RzRegex);
190  if (!r) {
191  return NULL;
192  }
193  memcpy(r, &rx, sizeof(RzRegex));
194  return r;
195 }
196 
197 RZ_API int rz_regex_flags(const char *f) {
198  int flags = 0;
199  if (!f || !*f) {
200  return 0;
201  }
202  if (strchr(f, 'e')) {
204  }
205  if (strchr(f, 'i')) {
207  }
208  if (strchr(f, 's')) {
210  }
211  if (strchr(f, 'n')) {
213  }
214  if (strchr(f, 'N')) {
216  }
217  if (strchr(f, 'p')) {
218  flags |= RZ_REGEX_PEND;
219  }
220  if (strchr(f, 'd')) {
221  flags |= RZ_REGEX_DUMP;
222  }
223  return flags;
224 }
225 
227  struct re_guts *g;
228  if (!preg) {
229  return;
230  }
231  if (preg->re_magic != MAGIC1) { /* oops */
232  return; /* nice to complain, but hard */
233  }
234 
235  g = preg->re_g;
236  if (!g || g->magic != MAGIC2) { /* oops again */
237  return;
238  }
239  preg->re_magic = 0; /* mark it invalid */
240  g->magic = 0; /* mark it invalid */
241 
242  free(g->strip);
243  free(g->sets);
244  free(g->setbits);
245  free(g->must);
246  free(g);
247 }
248 
250  rz_regex_fini(preg);
251  free(preg);
252 }
253 
254 /*
255  - regcomp - interface for parser and compilation
256  - 0 success, otherwise RZ_REGEX_something
257  */
258 RZ_API int rz_regex_comp(RzRegex *preg, const char *pattern, int cflags) {
259  struct parse pa;
260  struct re_guts *g;
261  struct parse *p = &pa;
262  int i;
263  size_t len;
264 #ifdef REDEBUG
265 #define GOODFLAGS(f) (f)
266 #else
267 #define GOODFLAGS(f) ((f) & ~RZ_REGEX_DUMP)
268 #endif
269  cflags = GOODFLAGS(cflags);
270  if (!preg || ((cflags & RZ_REGEX_EXTENDED) && (cflags & RZ_REGEX_NOSPEC))) {
271  return RZ_REGEX_INVARG;
272  }
273  if (cflags & RZ_REGEX_PEND) {
274  if (preg->re_endp < pattern) {
275  return RZ_REGEX_INVARG;
276  }
277  len = preg->re_endp - pattern;
278  } else {
279  len = strlen((char *)pattern);
280  }
281  /* do the mallocs early so failure handling is easy */
282  g = calloc(1, sizeof(struct re_guts) + (NC - 1));
283  if (!g) {
284  return RZ_REGEX_ESPACE;
285  }
286  /*
287  * Limit the pattern space to avoid a 32-bit overflow on buffer
288  * extension. Also avoid any signed overflow in case of conversion
289  * so make the real limit based on a 31-bit overflow.
290  *
291  * Likely not applicable on 64-bit systems but handle the case
292  * generically (who are we to stop people from using ~715MB+
293  * patterns?).
294  */
295  size_t maxlen = ((size_t)-1 >> 1) / sizeof(sop) * 2 / 3;
296  if (len >= maxlen) {
297  free(g);
298  return RZ_REGEX_ESPACE;
299  }
300  preg->re_flags = cflags;
301  p->ssize = len / (size_t)2 * (size_t)3 + (size_t)1; /* ugh */
302  if (p->ssize < len) {
303  free(g);
304  return RZ_REGEX_ESPACE;
305  }
306 
307  p->strip = (sop *)calloc(p->ssize, sizeof(sop));
308  if (!p->strip) {
309  free(g);
310  return RZ_REGEX_ESPACE;
311  }
312  p->slen = 0;
313  if (!p->strip) {
314  free(g);
315  return RZ_REGEX_ESPACE;
316  }
317 
318  /* set things up */
319  p->g = g;
320  p->next = (char *)pattern; /* convenience; we do not modify it */
321  p->end = p->next + len;
322  p->error = 0;
323  p->ncsalloc = 0;
324  for (i = 0; i < NPAREN; i++) {
325  p->pbegin[i] = 0;
326  p->pend[i] = 0;
327  }
328  g->csetsize = NC;
329  g->sets = NULL;
330  g->setbits = NULL;
331  g->ncsets = 0;
332  g->cflags = cflags;
333  g->iflags = 0;
334  g->nbol = 0;
335  g->neol = 0;
336  g->must = NULL;
337  g->mlen = 0;
338  g->nsub = 0;
339  g->ncategories = 1; /* category 0 is "everything else" */
340  g->categories = &g->catspace[-(CHAR_MIN)];
341  (void)memset((char *)g->catspace, 0, NC * sizeof(cat_t));
342  g->backrefs = 0;
343 
344  /* do it */
345  EMIT(OEND, 0);
346  g->firststate = THERE();
347  if (cflags & RZ_REGEX_EXTENDED) {
348  p_ere(p, OUT);
349  } else if (cflags & RZ_REGEX_NOSPEC) {
350  p_str(p);
351  } else {
352  p_bre(p, OUT, OUT);
353  }
354  EMIT(OEND, 0);
355  g->laststate = THERE();
356 
357  /* tidy up loose ends and fill things in */
358  categorize(p, g);
359  stripsnug(p, g);
360  findmust(p, g);
361  g->nplus = pluscount(p, g);
362  g->magic = MAGIC2;
363  preg->re_nsub = g->nsub;
364  preg->re_g = g;
365  preg->re_magic = MAGIC1;
366 #ifndef REDEBUG
367  /* not debugging, so can't rely on the asssert() in regexec() */
368  if (g->iflags & BAD) {
370  }
371 #endif
372  if (p->error) {
373  rz_regex_fini(preg);
374  }
375  return p->error;
376 }
377 
378 /*
379  - p_ere - ERE parser top level, concatenation and alternation
380  */
381 static void p_ere(struct parse *p, int stop) { /* character this ERE should end at */
382  bool isFirst = true;
383  sopno prevback = 0;
384  sopno prevfwd = 0;
385  sopno conc = 0;
386  char c;
387 
388  for (;;) {
389  /* do a bunch of concatenated expressions */
390  conc = HERE();
391  while (MORE() && (c = PEEK()) != '|' && c != stop) {
392  p_ere_exp(p);
393  }
394  REQUIRE(HERE() != conc, RZ_REGEX_EMPTY); /* require nonempty */
395 
396  if (!EAT('|')) {
397  break; /* NOTE BREAK OUT */
398  }
399  if (isFirst) {
400  INSERT(OCH_, conc); /* offset is wrong */
401  prevfwd = conc;
402  prevback = conc;
403  isFirst = false;
404  }
405  ASTERN(OOR1, prevback);
406  prevback = THERE();
407  AHEAD(prevfwd); /* fix previous offset */
408  prevfwd = HERE();
409  EMIT(OOR2, 0); /* offset is very wrong */
410  }
411 
412  if (!isFirst) { /* tail-end fixups */
413  AHEAD(prevfwd);
414  ASTERN(O_CH, prevback);
415  }
416  // asert(!MORE() || SEE(stop));
417 }
418 
419 /*
420  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
421  */
422 static void p_ere_exp(struct parse *p) {
423  char c;
424  sopno pos;
425  int count;
426  int count2;
427  sopno subno;
428  int wascaret = 0;
429 
430  if (!MORE()) { /* caller should have ensured this */
431  return;
432  }
433  c = GETNEXT();
434 
435  pos = HERE();
436  switch (c) {
437  case '(':
439  p->g->nsub++;
440  subno = p->g->nsub;
441  if (subno < NPAREN) {
442  p->pbegin[subno] = HERE();
443  }
444  EMIT(OLPAREN, subno);
445  if (!SEE(')')) {
446  p_ere(p, ')');
447  }
448  if (subno < NPAREN) {
449  p->pend[subno] = HERE();
450  if (!p->pend[subno]) {
451  break;
452  }
453  }
454  EMIT(ORPAREN, subno);
455  MUSTEAT(')', RZ_REGEX_EPAREN);
456  break;
457  case '^':
458  EMIT(OBOL, 0);
459  p->g->iflags |= USEBOL;
460  p->g->nbol++;
461  wascaret = 1;
462  break;
463  case '$':
464  EMIT(OEOL, 0);
465  p->g->iflags |= USEEOL;
466  p->g->neol++;
467  break;
468  case '|':
470  break;
471  case '*':
472  case '+':
473  case '?':
475  break;
476  case '.':
477  if (p->g->cflags & RZ_REGEX_NEWLINE) {
478  nonnewline(p);
479  } else {
480  EMIT(OANY, 0);
481  }
482  break;
483  case '[':
484  p_bracket(p);
485  break;
486  case '\\':
488  c = GETNEXT();
489  if (!isalpha(c)) {
490  ordinary(p, c);
491  } else {
492  special(p, c);
493  }
494  break;
495  case '{': /* okay as ordinary except if digit follows */
497  /* FALLTHROUGH */
498  default:
499  ordinary(p, c);
500  break;
501  }
502 
503  if (!MORE()) {
504  return;
505  }
506  c = PEEK();
507  /* we call { a repetition if followed by a digit */
508  if (!(c == '*' || c == '+' || c == '?' ||
509  (c == '{' && MORE2() && isdigit((ut8)PEEK2())))) {
510  return; /* no repetition, we're done */
511  }
512  NEXT();
513 
514  REQUIRE(!wascaret, RZ_REGEX_BADRPT);
515  switch (c) {
516  case '*': /* implemented as +? */
517  /* this case does not require the (y|) trick, noKLUDGE */
518  INSERT(OPLUS_, pos);
519  ASTERN(O_PLUS, pos);
520  INSERT(OQUEST_, pos);
521  ASTERN(O_QUEST, pos);
522  break;
523  case '+':
524  INSERT(OPLUS_, pos);
525  ASTERN(O_PLUS, pos);
526  break;
527  case '?':
528  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
529  INSERT(OCH_, pos); /* offset slightly wrong */
530  ASTERN(OOR1, pos); /* this one's right */
531  AHEAD(pos); /* fix the OCH_ */
532  EMIT(OOR2, 0); /* offset very wrong... */
533  AHEAD(THERE()); /* ...so fix it */
534  ASTERN(O_CH, THERETHERE());
535  break;
536  case '{':
537  count = p_count(p);
538  if (EAT(',')) {
539  if (isdigit((ut8)PEEK())) {
540  count2 = p_count(p);
541  REQUIRE(count <= count2, RZ_REGEX_BADBR);
542  } else { /* single number with comma */
543  count2 = INTFINITY;
544  }
545  } else { /* just a single number */
546  count2 = count;
547  }
548  repeat(p, pos, count, count2);
549  if (!EAT('}')) { /* error heuristics */
550  while (MORE() && PEEK() != '}') {
551  NEXT();
552  }
555  }
556  break;
557  }
558 
559  if (!MORE()) {
560  return;
561  }
562  c = PEEK();
563  if (!(c == '*' || c == '+' || c == '?' ||
564  (c == '{' && MORE2() && isdigit((ut8)PEEK2())))) {
565  return;
566  }
568 }
569 
570 /*
571  - p_str - string (no metacharacters) "parser"
572  */
573 static void p_str(struct parse *p) {
575  while (MORE()) {
576  ordinary(p, GETNEXT());
577  }
578 }
579 
580 /*
581  - p_bre - BRE parser top level, anchoring and concatenation
582  * Giving end1 as OUT essentially eliminates the end1/end2 check.
583  *
584  * This implementation is a bit of a kludge, in that a trailing $ is first
585  * taken as an ordinary character and then revised to be an anchor. The
586  * only undesirable side effect is that '$' gets included as a character
587  * category in such cases. This is fairly harmless; not worth fixing.
588  * The amount of lookahead needed to avoid this kludge is excessive.
589  */
590 static void p_bre(struct parse *p,
591  int end1, /* first terminating character */
592  int end2) /* second terminating character */
593 {
594  sopno start = HERE();
595  int first = 1; /* first subexpression? */
596  int wasdollar = 0;
597 
598  if (EAT('^')) {
599  EMIT(OBOL, 0);
600  p->g->iflags |= USEBOL;
601  p->g->nbol++;
602  }
603  while (MORE() && !SEETWO(end1, end2)) {
604  wasdollar = p_simp_re(p, first);
605  first = 0;
606  }
607  if (wasdollar) { /* oops, that was a trailing anchor */
608  DROP(1);
609  EMIT(OEOL, 0);
610  p->g->iflags |= USEEOL;
611  p->g->neol++;
612  }
613 
614  REQUIRE(HERE() != start, RZ_REGEX_EMPTY); /* require nonempty */
615 }
616 
617 /*
618  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
619  */
620 static int /* was the simple RE an unbackslashed $? */
621 p_simp_re(struct parse *p,
622  int starordinary) /* is a leading * an ordinary character? */
623 {
624  int c;
625  int count;
626  int count2;
627  sopno pos;
628  int i;
629  sopno subno;
630 #define BACKSL (1 << CHAR_BIT)
631 
632  pos = HERE(); /* repetion op, if any, covers from here */
633 
634  if (!MORE()) { /* caller should have ensured this */
635  return 0;
636  }
637  c = GETNEXT();
638  if (c == '\\') {
640  c = BACKSL | GETNEXT();
641  }
642  switch (c) {
643  case '.':
644  if (p->g->cflags & RZ_REGEX_NEWLINE) {
645  nonnewline(p);
646  } else {
647  EMIT(OANY, 0);
648  }
649  break;
650  case '[':
651  p_bracket(p);
652  break;
653  case BACKSL | '{':
655  break;
656  case BACKSL | '(':
657  p->g->nsub++;
658  subno = p->g->nsub;
659  if (subno < NPAREN) {
660  p->pbegin[subno] = HERE();
661  }
662  EMIT(OLPAREN, subno);
663  /* the MORE here is an error heuristic */
664  if (MORE() && !SEETWO('\\', ')')) {
665  p_bre(p, '\\', ')');
666  }
667  if (subno < NPAREN) {
668  p->pend[subno] = HERE();
669  if (!p->pend[subno]) {
670  break;
671  }
672  }
673  EMIT(ORPAREN, subno);
674  REQUIRE(EATTWO('\\', ')'), RZ_REGEX_EPAREN);
675  break;
676  case BACKSL | ')': /* should not get here -- must be user */
677  case BACKSL | '}':
679  break;
680  case BACKSL | '1':
681  case BACKSL | '2':
682  case BACKSL | '3':
683  case BACKSL | '4':
684  case BACKSL | '5':
685  case BACKSL | '6':
686  case BACKSL | '7':
687  case BACKSL | '8':
688  case BACKSL | '9':
689  i = (c & ~BACKSL) - '0';
690  if (p->pend[i] != 0) {
691  if (i <= p->g->nsub) {
692  EMIT(OBACK_, i);
693  if (p->pbegin[i] != 0 && OP(p->strip[p->pbegin[i]]) == OLPAREN &&
694  OP(p->strip[p->pend[i]]) == ORPAREN) {
695  (void)dupl(p, p->pbegin[i] + 1, p->pend[i]);
696  EMIT(O_BACK, i);
697  }
698  }
699  } else {
701  }
702  p->g->backrefs = 1;
703  break;
704  case '*':
705  REQUIRE(starordinary, RZ_REGEX_BADRPT);
706  /* FALLTHROUGH */
707  default:
708  ordinary(p, (char)c);
709  break;
710  }
711 
712  if (EAT('*')) { /* implemented as +? */
713  /* this case does not require the (y|) trick, noKLUDGE */
714  INSERT(OPLUS_, pos);
715  ASTERN(O_PLUS, pos);
716  INSERT(OQUEST_, pos);
717  ASTERN(O_QUEST, pos);
718  } else if (EATTWO('\\', '{')) {
719  count = p_count(p);
720  if (EAT(',')) {
721  if (MORE() && isdigit((ut8)PEEK())) {
722  count2 = p_count(p);
723  REQUIRE(count <= count2, RZ_REGEX_BADBR);
724  } else { /* single number with comma */
725  count2 = INTFINITY;
726  }
727  } else { /* just a single number */
728  count2 = count;
729  }
730  repeat(p, pos, count, count2);
731  if (!EATTWO('\\', '}')) { /* error heuristics */
732  while (MORE() && !SEETWO('\\', '}')) {
733  NEXT();
734  }
737  }
738  } else if (c == '$') { /* $ (but not \$) ends it */
739  return (1);
740  }
741 
742  return (0);
743 }
744 
745 /*
746  - p_count - parse a repetition count
747  */
748 static int /* the value */
749 p_count(struct parse *p) {
750  int count = 0;
751  int ndigits = 0;
752 
753  while (MORE() && isdigit((ut8)PEEK()) && count <= DUPMAX) {
754  count = count * 10 + (GETNEXT() - '0');
755  ndigits++;
756  }
757 
758  REQUIRE(ndigits > 0 && count <= DUPMAX, RZ_REGEX_BADBR);
759  return (count);
760 }
761 
762 /*
763  - p_bracket - parse a bracketed character list
764  *
765  * Note a significant property of this code: if the allocset() did SETERROR,
766  * no set operations are done.
767  */
768 static void p_bracket(struct parse *p) {
769  cset *cs;
770  int invert = 0;
771 
772  /* Dept of Truly Sickening Special-Case Kludges */
773  if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
774  EMIT(OBOW, 0);
775  NEXTn(6);
776  return;
777  }
778  if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
779  EMIT(OEOW, 0);
780  NEXTn(6);
781  return;
782  }
783 
784  if (!(cs = allocset(p))) {
785  /* allocset did set error status in p */
786  return;
787  }
788 
789  if (EAT('^')) {
790  invert++; /* make note to invert set at end */
791  }
792  if (EAT(']')) {
793  CHadd(cs, ']');
794  } else if (EAT('-')) {
795  CHadd(cs, '-');
796  }
797  while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) {
798  p_b_term(p, cs);
799  }
800  if (EAT('-')) {
801  CHadd(cs, '-');
802  }
803  MUSTEAT(']', RZ_REGEX_EBRACK);
804 
805  if (p->error != 0) { /* don't mess things up further */
806  freeset(p, cs);
807  return;
808  }
809 
810  if (p->g->cflags & RZ_REGEX_ICASE) {
811  int i;
812  int ci;
813 
814  for (i = p->g->csetsize - 1; i >= 0; i--) {
815  if (CHIN(cs, i) && isalpha(i)) {
816  ci = othercase(i);
817  if (ci != i) {
818  CHadd(cs, ci);
819  }
820  }
821  }
822  if (cs->multis != NULL) {
823  mccase(p, cs);
824  }
825  }
826  if (invert) {
827  int i;
828 
829  for (i = p->g->csetsize - 1; i >= 0; i--) {
830  if (CHIN(cs, i)) {
831  CHsub(cs, i);
832  } else {
833  CHadd(cs, i);
834  }
835  }
836  if (p->g->cflags & RZ_REGEX_NEWLINE) {
837  CHsub(cs, '\n');
838  }
839  if (cs->multis != NULL) {
840  mcinvert(p, cs);
841  }
842  }
843 
844  if (cs->multis) { /* xxx */
845  return;
846  }
847 
848  if (nch(p, cs) == 1) { /* optimize singleton sets */
849  ordinary(p, firstch(p, cs));
850  freeset(p, cs);
851  } else {
852  EMIT(OANYOF, freezeset(p, cs));
853  }
854 }
855 
856 /*
857  - p_b_term - parse one term of a bracketed character list
858  */
859 static void p_b_term(struct parse *p, cset *cs) {
860  char c;
861  char start = 0, finish;
862  int i;
863 
864  /* classify what we've got */
865  switch ((MORE()) ? PEEK() : '\0') {
866  case '[':
867  c = (MORE2()) ? PEEK2() : '\0';
868  break;
869  case '-':
871  return; /* NOTE RETURN */
872  break;
873  default:
874  c = '\0';
875  break;
876  }
877 
878  switch (c) {
879  case ':': /* character class */
880  NEXT2();
882  c = PEEK();
883  REQUIRE(c != '-' && c != ']', RZ_REGEX_ECTYPE);
884  p_b_cclass(p, cs);
886  REQUIRE(EATTWO(':', ']'), RZ_REGEX_ECTYPE);
887  break;
888  case '=': /* equivalence class */
889  NEXT2();
891  c = PEEK();
892  REQUIRE(c != '-' && c != ']', RZ_REGEX_ECOLLATE);
893  p_b_eclass(p, cs);
895  REQUIRE(EATTWO('=', ']'), RZ_REGEX_ECOLLATE);
896  break;
897  default: /* symbol, ordinary character, or range */
898  /* xxx revision needed for multichar stuff */
899  start = p_b_symbol(p);
900  if (SEE('-') && MORE2() && PEEK2() != ']') {
901  /* range */
902  NEXT();
903  if (EAT('-')) {
904  finish = '-';
905  } else {
906  finish = p_b_symbol(p);
907  }
908  } else {
909  finish = start;
910  }
911  /* xxx what about signed chars here... */
913  for (i = start; i <= finish; i++) {
914  CHadd(cs, i);
915  }
916  break;
917  }
918 }
919 
920 /*
921  - p_b_cclass - parse a character-class name and deal with it
922  */
923 static void p_b_cclass(struct parse *p, cset *cs) {
924  char *sp = p->next;
925  struct cclass *cp;
926  size_t len;
927  char *u;
928  char c;
929 
930  while (MORE() && isalpha((unsigned char)PEEK())) {
931  NEXT();
932  }
933  len = p->next - sp;
934  for (cp = cclasses; cp->name != NULL; cp++) {
935  if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') {
936  break;
937  }
938  }
939  if (!cp->name) {
940  /* oops, didn't find it */
942  return;
943  }
944 
945  u = cp->chars;
946  while ((c = *u++) != '\0') {
947  CHadd(cs, c);
948  }
949  for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) {
950  MCadd(p, cs, u);
951  }
952 }
953 
954 /*
955  - p_b_eclass - parse an equivalence-class name and deal with it
956  *
957  * This implementation is incomplete. xxx
958  */
959 static void p_b_eclass(struct parse *p, cset *cs) {
960  char c;
961 
962  c = p_b_coll_elem(p, '=');
963  CHadd(cs, c);
964 }
965 
966 /*
967  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
968  */
969 static char /* value of symbol */
970 p_b_symbol(struct parse *p) {
971  char value;
972 
974  if (!EATTWO('[', '.')) {
975  return (GETNEXT());
976  }
977 
978  /* collating symbol */
979  value = p_b_coll_elem(p, '.');
980  REQUIRE(EATTWO('.', ']'), RZ_REGEX_ECOLLATE);
981  return (value);
982 }
983 
984 /*
985  - p_b_coll_elem - parse a collating-element name and look it up
986  */
987 static char /* value of collating element */
989  int endc) /* name ended by endc,']' */
990 {
991  char *sp = p->next;
992  struct cname *cp;
993  int len;
994 
995  while (MORE() && !SEETWO(endc, ']')) {
996  NEXT();
997  }
998  if (!MORE()) {
1000  return (0);
1001  }
1002  len = p->next - sp;
1003  for (cp = cnames; cp->name != NULL; cp++) {
1004  if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') {
1005  return (cp->code); /* known name */
1006  }
1007  }
1008  if (len == 1) {
1009  return (*sp); /* single character */
1010  }
1011  SETERROR(RZ_REGEX_ECOLLATE); /* neither */
1012  return (0);
1013 }
1014 
1015 /*
1016  - othercase - return the case counterpart of an alphabetic
1017  */
1018 static char /* if no counterpart, return ch */
1019 othercase(int ch) {
1020  ch = (ut8)ch;
1021  if (isalpha(ch)) {
1022  if (isupper(ch)) {
1023  return ((ut8)tolower(ch));
1024  } else if (islower(ch)) {
1025  return ((ut8)toupper(ch));
1026  } else { /* peculiar, but could happen */
1027  return (ch);
1028  }
1029  }
1030  return ch;
1031 }
1032 
1033 /*
1034  - bothcases - emit a dualcase version of a two-case character
1035  *
1036  * Boy, is this implementation ever a kludge...
1037  */
1038 static void bothcases(struct parse *p, int ch) {
1039  char *oldnext = p->next;
1040  char *oldend = p->end;
1041  char bracket[3];
1042 
1043  ch = (ut8)ch;
1044  if (othercase(ch) != ch) { /* p_bracket() would recurse */
1045  p->next = bracket;
1046  p->end = bracket + 2;
1047  bracket[0] = ch;
1048  bracket[1] = ']';
1049  bracket[2] = '\0';
1050  p_bracket(p);
1051  if (p->next == bracket + 2) {
1052  p->next = oldnext;
1053  p->end = oldend;
1054  }
1055  }
1056 }
1057 
1058 /*
1059  - ordinary - emit an ordinary character
1060  */
1061 static void
1062 ordinary(struct parse *p, int ch) {
1063  cat_t *cap = p->g->categories;
1064 
1065  if ((p->g->cflags & RZ_REGEX_ICASE) && isalpha((ut8)ch) && othercase(ch) != ch) {
1066  bothcases(p, ch);
1067  } else {
1068  EMIT(OCHAR, (ut8)ch);
1069  if (cap[ch] == 0) {
1070  cap[ch] = p->g->ncategories++;
1071  }
1072  }
1073 }
1074 
1075 static void
1076 special(struct parse *p, int ch) {
1077  char *oldnext = p->next;
1078  char *oldend = p->end;
1079  char bracket[16] = { 0 };
1080  char digits[3] = { 0 };
1081  char c;
1082  int num = 0;
1083  switch (ch) {
1084  case 'x':
1085  digits[0] = GETNEXT();
1086  digits[1] = GETNEXT();
1087  c = (char)strtol(digits, NULL, 16);
1088  ordinary(p, c);
1089  return;
1090  case 'n':
1091  ordinary(p, '\n');
1092  return;
1093  case 't':
1094  ordinary(p, '\t');
1095  return;
1096  case 'r':
1097  ordinary(p, '\r');
1098  return;
1099  case 's':
1100  num = 5;
1101  memcpy(bracket, "\t\r\n ]", num);
1102  break;
1103  case 'd':
1104  num = 4;
1105  memcpy(bracket, "0-9]", num);
1106  break;
1107  case 'w':
1108  num = 4;
1109  memcpy(bracket, "a-z]", num);
1110  break;
1111  default:
1113  return;
1114  }
1115 
1116  p->next = bracket;
1117  p->end = bracket + num;
1118 
1119  p_bracket(p);
1120 
1121  if (p->next == bracket + num) {
1122  p->next = oldnext;
1123  p->end = oldend;
1124  }
1125 }
1126 
1127 /*
1128  - nonnewline - emit RZ_REGEX_NEWLINE version of OANY
1129  *
1130  * Boy, is this implementation ever a kludge...
1131  */
1132 static void
1133 nonnewline(struct parse *p) {
1134  char *oldnext = p->next;
1135  char *oldend = p->end;
1136  char bracket[4];
1137 
1138  p->next = bracket;
1139  p->end = bracket + 3;
1140  bracket[0] = '^';
1141  bracket[1] = '\n';
1142  bracket[2] = ']';
1143  bracket[3] = '\0';
1144  p_bracket(p);
1145  if (p->next == bracket + 3) {
1146  p->next = oldnext;
1147  p->end = oldend;
1148  }
1149 }
1150 
1151 /*
1152  - repeat - generate code for a bounded repetition, recursively if needed
1153  */
1154 static void
1155 repeat(struct parse *p,
1156  sopno start, /* operand from here to end of strip */
1157  int from, /* repeated from this number */
1158  int to) /* to this number of times (maybe INTFINITY) */
1159 {
1160  sopno finish = HERE();
1161 #define N 2
1162 #define INF 3
1163 #define REP(f, t) ((f)*8 + (t))
1164 #define MAP(n) (((n) <= 1) ? (n) : ((n) == INTFINITY) ? INF \
1165  : N)
1166  sopno copy;
1167 
1168  if (p->error != 0) { /* head off possible runaway recursion */
1169  return;
1170  }
1171 
1172  if (from > to) {
1173  return;
1174  }
1175 
1176  switch (REP(MAP(from), MAP(to))) {
1177  case REP(0, 0): /* must be user doing this */
1178  DROP(finish - start); /* drop the operand */
1179  break;
1180  case REP(0, 1): /* as x{1,1}? */
1181  case REP(0, N): /* as x{1,n}? */
1182  case REP(0, INF): /* as x{1,}? */
1183  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1184  INSERT(OCH_, start); /* offset is wrong... */
1185  repeat(p, start + 1, 1, to);
1186  ASTERN(OOR1, start);
1187  AHEAD(start); /* ... fix it */
1188  EMIT(OOR2, 0);
1189  AHEAD(THERE());
1190  ASTERN(O_CH, THERETHERE());
1191  break;
1192  case REP(1, 1): /* trivial case */
1193  /* done */
1194  break;
1195  case REP(1, N): /* as x?x{1,n-1} */
1196  /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1197  INSERT(OCH_, start);
1198  ASTERN(OOR1, start);
1199  AHEAD(start);
1200  EMIT(OOR2, 0); /* offset very wrong... */
1201  AHEAD(THERE()); /* ...so fix it */
1202  ASTERN(O_CH, THERETHERE());
1203  copy = dupl(p, start + 1, finish + 1);
1204  if (copy == finish + 4) {
1205  repeat(p, copy, 1, to - 1);
1206  }
1207  break;
1208  case REP(1, INF): /* as x+ */
1209  INSERT(OPLUS_, start);
1210  ASTERN(O_PLUS, start);
1211  break;
1212  case REP(N, N): /* as xx{m-1,n-1} */
1213  copy = dupl(p, start, finish);
1214  repeat(p, copy, from - 1, to - 1);
1215  break;
1216  case REP(N, INF): /* as xx{n-1,INF} */
1217  copy = dupl(p, start, finish);
1218  repeat(p, copy, from - 1, to);
1219  break;
1220  default: /* "can't happen" */
1221  SETERROR(RZ_REGEX_ASSERT); /* just in case */
1222  break;
1223  }
1224 }
1225 
1226 /*
1227  - seterr - set an error condition
1228  */
1229 static int /* useless but makes type checking happy */
1230 seterr(struct parse *p, int e) {
1231  if (p->error == 0) { /* keep earliest error condition */
1232  p->error = e;
1233  }
1234  p->next = nuls; /* try to bring things to a halt */
1235  p->end = nuls;
1236  return (0); /* make the return value well-defined */
1237 }
1238 
1239 /*
1240  - allocset - allocate a set of characters for []
1241  */
1242 static cset *allocset(struct parse *p) {
1243  int no = p->g->ncsets++;
1244  size_t nc;
1245  size_t nbytes;
1246  cset *cs;
1247  size_t css = (size_t)p->g->csetsize;
1248  int i;
1249 
1250  if (no >= p->ncsalloc) { /* need another column of space */
1251  void *ptr;
1252 
1253  p->ncsalloc += CHAR_BIT;
1254  nc = p->ncsalloc;
1255  if (nc % CHAR_BIT) {
1256  goto nomem;
1257  }
1258  nbytes = nc / CHAR_BIT * css;
1259 
1260  ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1261  if (!ptr) {
1262  goto nomem;
1263  }
1264  p->g->sets = ptr;
1265 
1266  ptr = (ut8 *)realloc((char *)p->g->setbits, nbytes);
1267  if (!ptr) {
1268  goto nomem;
1269  }
1270  p->g->setbits = ptr;
1271 
1272  for (i = 0; i < no; i++) {
1273  p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT);
1274  }
1275 
1276  (void)memset((char *)p->g->setbits + (nbytes - css), 0, css);
1277  }
1278  /* XXX should not happen */
1279  if (!p->g->sets || !p->g->setbits) {
1280  goto nomem;
1281  }
1282 
1283  cs = &p->g->sets[no];
1284  cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT);
1285  cs->mask = 1 << ((no) % CHAR_BIT);
1286  cs->hash = 0;
1287  cs->smultis = 0;
1288  cs->multis = NULL;
1289 
1290  return (cs);
1291 nomem:
1292  RZ_FREE(p->g->sets);
1293  RZ_FREE(p->g->setbits);
1294 
1296  /* caller's responsibility not to do set ops */
1297  return (NULL);
1298 }
1299 
1300 /*
1301  - freeset - free a now-unused set
1302  */
1303 static void freeset(struct parse *p, cset *cs) {
1304  int i;
1305  cset *top = &p->g->sets[p->g->ncsets];
1306  size_t css = (size_t)p->g->csetsize;
1307 
1308  for (i = 0; i < css; i++) {
1309  CHsub(cs, i);
1310  }
1311  if (cs == top - 1) { /* recover only the easy case */
1312  p->g->ncsets--;
1313  }
1314 }
1315 
1316 /*
1317  - freezeset - final processing on a set of characters
1318  *
1319  * The main task here is merging identical sets. This is usually a waste
1320  * of time (although the hash code minimizes the overhead), but can win
1321  * big if RZ_REGEX_ICASE is being used. RZ_REGEX_ICASE, by the way, is why the hash
1322  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1323  * the same value!
1324  */
1325 static int /* set number */
1326 freezeset(struct parse *p, cset *cs) {
1327  ut8 h = cs->hash;
1328  int i;
1329  cset *top = &p->g->sets[p->g->ncsets];
1330  cset *cs2;
1331  size_t css = (size_t)p->g->csetsize;
1332 
1333  /* look for an earlier one which is the same */
1334  for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) {
1335  if (cs2->hash == h && cs2 != cs) {
1336  /* maybe */
1337  for (i = 0; i < css; i++) {
1338  if (!!CHIN(cs2, i) != !!CHIN(cs, i)) {
1339  break; /* no */
1340  }
1341  }
1342  if (i == css) {
1343  break; /* yes */
1344  }
1345  }
1346  }
1347 
1348  if (cs2 < top) { /* found one */
1349  freeset(p, cs);
1350  cs = cs2;
1351  }
1352 
1353  return ((int)(cs - p->g->sets));
1354 }
1355 
1356 /*
1357  - firstch - return first character in a set (which must have at least one)
1358  */
1359 static int /* character; there is no "none" value */
1360 firstch(struct parse *p, cset *cs) {
1361  int i;
1362  size_t css = (size_t)p->g->csetsize;
1363 
1364  for (i = 0; i < css; i++) {
1365  if (CHIN(cs, i)) {
1366  return ((char)i);
1367  }
1368  }
1369  return (0); /* arbitrary */
1370 }
1371 
1372 /*
1373  - nch - number of characters in a set
1374  */
1375 static int nch(struct parse *p, cset *cs) {
1376  int i;
1377  size_t css = (size_t)p->g->csetsize;
1378  int n = 0;
1379 
1380  for (i = 0; i < css; i++) {
1381  if (CHIN(cs, i)) {
1382  n++;
1383  }
1384  }
1385  return (n);
1386 }
1387 
1388 /*
1389  - mcadd - add a collating element to a cset
1390  */
1391 static void mcadd(struct parse *p, cset *cs, char *cp) {
1392  size_t oldend = cs->smultis;
1393  void *np;
1394 
1395  cs->smultis += strlen(cp) + 1;
1396  np = realloc(cs->multis, cs->smultis);
1397  if (!np) {
1398  if (cs->multis) {
1399  free(cs->multis);
1400  }
1401  cs->multis = NULL;
1403  return;
1404  }
1405  cs->multis = np;
1406 
1407  STRLCPY(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1408 }
1409 
1410 /*
1411  - mcinvert - invert the list of collating elements in a cset
1412  *
1413  * This would have to know the set of possibilities. Implementation
1414  * is deferred.
1415  */
1416 /* ARGSUSED */
1417 static void mcinvert(struct parse *p, cset *cs) {
1418  // asert(!cs->multis); /* xxx */
1419  return;
1420 }
1421 
1422 /*
1423  - mccase - add case counterparts of the list of collating elements in a cset
1424  *
1425  * This would have to know the set of possibilities. Implementation
1426  * is deferred.
1427  */
1428 /* ARGSUSED */
1429 static void mccase(struct parse *p, cset *cs) {
1430  // asert(!cs->multis); /* xxx */
1431  return;
1432 }
1433 
1434 /*
1435  - isinsets - is this character in any sets?
1436  */
1437 static int /* predicate */
1438 isinsets(struct re_guts *g, int c) {
1439  ut8 *col;
1440  int i;
1441  int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
1442  unsigned uc = (ut8)c;
1443 
1444  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) {
1445  if (col[uc] != 0) {
1446  return (1);
1447  }
1448  }
1449  return (0);
1450 }
1451 
1452 /*
1453  - samesets - are these two characters in exactly the same sets?
1454  */
1455 static int /* predicate */
1456 samesets(struct re_guts *g, int c1, int c2) {
1457  ut8 *col;
1458  int i;
1459  int ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
1460  unsigned uc1 = (ut8)c1;
1461  unsigned uc2 = (ut8)c2;
1462 
1463  for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) {
1464  if (col[uc1] != col[uc2]) {
1465  return (0);
1466  }
1467  }
1468  return (1);
1469 }
1470 
1471 /*
1472  - categorize - sort out character categories
1473  */
1474 static void
1475 categorize(struct parse *p, struct re_guts *g) {
1476  cat_t *cats = g ? g->categories : NULL;
1477  int c;
1478  int c2;
1479  cat_t cat;
1480 
1481  /* avoid making error situations worse */
1482  if (!p || p->error != 0 || !cats) {
1483  return;
1484  }
1485 
1486  for (c = CHAR_MIN; c <= CHAR_MAX; c++) {
1487  if (*(cats + c) && isinsets(g, c)) {
1488  cat = g->ncategories++;
1489  cats[c] = cat;
1490  for (c2 = c + 1; c2 <= CHAR_MAX; c2++) {
1491  if (cats[c2] == 0 && samesets(g, c, c2)) {
1492  cats[c2] = cat;
1493  }
1494  }
1495  }
1496  }
1497 }
1498 
1499 /*
1500  - dupl - emit a duplicate of a bunch of sops
1501  */
1502 static sopno /* start of duplicate */
1503 dupl(struct parse *p,
1504  sopno start, /* from here */
1505  sopno finish) /* to this less one */
1506 {
1507  sopno ret = HERE();
1508  sopno len = finish - start;
1509 
1510  if (finish >= start) {
1511  if (len == 0) {
1512  return (ret);
1513  }
1514  enlarge(p, p->ssize + len); /* this many unexpected additions */
1515  if (p->ssize >= p->slen + len) {
1516  (void)memcpy((char *)(p->strip + p->slen),
1517  (char *)(p->strip + start), (size_t)len * sizeof(sop));
1518  p->slen += len;
1519  return (ret);
1520  }
1521  }
1522  return ret;
1523 }
1524 
1525 /*
1526  - doemit - emit a strip operator
1527  *
1528  * It might seem better to implement this as a macro with a function as
1529  * hard-case backup, but it's just too big and messy unless there are
1530  * some changes to the data structures. Maybe later.
1531  */
1532 static void
1533 doemit(struct parse *p, sop op, size_t opnd) {
1534  /* avoid making error situations worse */
1535  if (p->error != 0) {
1536  return;
1537  }
1538 
1539  /* deal with oversize operands ("can't happen", more or less) */
1540  if (opnd < 1 << OPSHIFT) {
1541 
1542  /* deal with undersized strip */
1543  if (p->slen >= p->ssize) {
1544  enlarge(p, (p->ssize + 1) / 2 * 3); /* +50% */
1545  }
1546  if (p->slen < p->ssize) {
1547  /* finally, it's all reduced to the easy case */
1548  p->strip[p->slen++] = SOP(op, opnd);
1549  }
1550  }
1551 }
1552 
1553 /*
1554  - doinsert - insert a sop into the strip
1555  */
1556 static void
1557 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) {
1558  sopno sn;
1559  sop s;
1560  int i;
1561 
1562  /* avoid making error situations worse */
1563  if (p->error != 0) {
1564  return;
1565  }
1566 
1567  sn = HERE();
1568  EMIT(op, opnd); /* do checks, ensure space */
1569  if (HERE() != sn + 1) {
1570  return;
1571  }
1572  s = p->strip[sn];
1573 
1574  /* adjust paren pointers */
1575  if (pos > 0) {
1576  for (i = 1; i < NPAREN; i++) {
1577  if (p->pbegin[i] >= pos) {
1578  p->pbegin[i]++;
1579  }
1580  if (p->pend[i] >= pos) {
1581  p->pend[i]++;
1582  }
1583  }
1584  }
1585 
1586  memmove((char *)&p->strip[pos + 1], (char *)&p->strip[pos],
1587  (HERE() - pos - 1) * sizeof(sop));
1588  p->strip[pos] = s;
1589 }
1590 
1591 /*
1592  - dofwd - complete a forward reference
1593  */
1594 static void
1596  /* avoid making error situations worse */
1597  if (p->error != 0) {
1598  return;
1599  }
1600 
1601  if (value < 1 << OPSHIFT) {
1602  p->strip[pos] = OP(p->strip[pos]) | value;
1603  }
1604 }
1605 
1606 /*
1607  - enlarge - enlarge the strip
1608  */
1609 static void
1610 enlarge(struct parse *p, sopno size) {
1611  sop *sp;
1612 
1613  if (p->ssize >= size) {
1614  return;
1615  }
1616 
1617  sp = (sop *)realloc(p->strip, size * sizeof(sop));
1618  if (!sp) {
1620  return;
1621  }
1622  p->strip = sp;
1623  p->ssize = size;
1624 }
1625 
1626 /*
1627  - stripsnug - compact the strip
1628  */
1629 static void
1630 stripsnug(struct parse *p, struct re_guts *g) {
1631  g->nstates = p->slen;
1632  g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1633  if (!g->strip) {
1635  g->strip = p->strip;
1636  }
1637 }
1638 
1639 /*
1640  - findmust - fill in must and mlen with longest mandatory literal string
1641  *
1642  * This algorithm could do fancy things like analyzing the operands of |
1643  * for common subsequences. Someday. This code is simple and finds most
1644  * of the interesting cases.
1645  *
1646  * Note that must and mlen got initialized during setup.
1647  */
1648 static void
1649 findmust(struct parse *p, struct re_guts *g) {
1650  sop *scan;
1651  sop *start = NULL; /* start initialized in the default case, after that */
1652  sop *newstart = NULL; /* newstart was initialized in the OCHAR case */
1653  sopno newlen;
1654  sop s;
1655  char *cp;
1656  sopno i;
1657 
1658  /* avoid making error situations worse */
1659  if (p->error != 0) {
1660  return;
1661  }
1662 
1663  /* find the longest OCHAR sequence in strip */
1664  newlen = 0;
1665  start = scan = g->strip + 1;
1666  do {
1667  s = *scan++;
1668  switch (OP(s)) {
1669  case OCHAR: /* sequence member */
1670  if (newlen == 0) { /* new sequence */
1671  newstart = scan - 1;
1672  }
1673  newlen++;
1674  break;
1675  case OPLUS_: /* things that don't break one */
1676  case OLPAREN:
1677  case ORPAREN:
1678  break;
1679  case OQUEST_: /* things that must be skipped */
1680  case OCH_:
1681  scan--;
1682  do {
1683  scan += OPND(s);
1684  s = *scan;
1685  /* asert() interferes w debug printouts */
1686  if (OP(s) != O_QUEST && OP(s) != O_CH &&
1687  OP(s) != OOR2) {
1688  g->iflags |= BAD;
1689  return;
1690  }
1691  } while (OP(s) != O_QUEST && OP(s) != O_CH);
1692  /* fallthrough */
1693  default: /* things that break a sequence */
1694  if (newlen > g->mlen) { /* ends one */
1695  start = newstart;
1696  g->mlen = newlen;
1697  }
1698  newlen = 0;
1699  break;
1700  }
1701  } while (OP(s) != OEND);
1702 
1703  if (g->mlen == 0) { /* there isn't one */
1704  return;
1705  }
1706 
1707  /* turn it into a character string */
1708  g->must = malloc((size_t)g->mlen + 1);
1709  if (!g->must) { /* argh; just forget it */
1710  g->mlen = 0;
1711  return;
1712  }
1713  cp = g->must;
1714  scan = start;
1715  for (i = g->mlen; i > 0; i--) {
1716  while (OP(s = *scan++) != OCHAR) {
1717  continue;
1718  }
1719  if (cp < g->must + g->mlen) {
1720  *cp++ = (char)OPND(s);
1721  }
1722  }
1723  if (cp == g->must + g->mlen) {
1724  *cp++ = '\0'; /* just on general principles */
1725  }
1726 }
1727 
1728 /*
1729  - pluscount - count + nesting
1730  */
1731 static sopno /* nesting depth */
1732 pluscount(struct parse *p, struct re_guts *g) {
1733  sop *scan;
1734  sop s;
1735  sopno plusnest = 0;
1736  sopno maxnest = 0;
1737 
1738  if (p->error != 0) {
1739  return (0); /* there may not be an OEND */
1740  }
1741 
1742  scan = g->strip + 1;
1743  do {
1744  s = *scan++;
1745  switch (OP(s)) {
1746  case OPLUS_:
1747  plusnest++;
1748  break;
1749  case O_PLUS:
1750  if (plusnest > maxnest) {
1751  maxnest = plusnest;
1752  }
1753  plusnest--;
1754  break;
1755  }
1756  } while (OP(s) != OEND);
1757  if (plusnest != 0) {
1758  g->iflags |= BAD;
1759  }
1760  return (maxnest);
1761 }
size_t len
Definition: 6502dis.c:15
#define OPND(x)
Definition: aarch64-tbl.h:33
static unsigned invert(unsigned x)
Definition: aesdata.c:73
#define e(frag)
static bool finish(void *user)
Definition: analysis_pyc.c:133
lzma_index ** i
Definition: index.h:629
static RzILOpEffect * cset(cs_insn *insn)
Definition: arm_il64.c:899
lsl lsr asr ror lsl lsr asr ror lsl lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror c1
lsl lsr asr ror lsl lsr asr ror lsl lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror lsl lsr asr ror c2
#define CHAR_BIT
Definition: readbits.h:99
static struct cclass cclasses[]
static int value
Definition: cmd_api.c:93
static struct cname cnames[]
#define RZ_API
#define NULL
Definition: cris-opc.c:27
#define r
Definition: crypto_rc6.c:12
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void static offset struct stat static buf void nbytes
Definition: sflib.h:113
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void count
Definition: sflib.h:98
static static sync static getppid static getegid const char static filename char static len const char char static bufsiz static mask static vfork const void static prot static getpgrp const char static swapflags static arg static fd static protocol static who struct sockaddr static addrlen static backlog struct timeval struct timezone static tz const struct iovec static count static mode const void const struct sockaddr static tolen const char static pathname void static offset struct stat static buf void long static basep static whence static length const void static len static semflg const void static shmflg const struct timespec struct timespec static rem const char static group const void start
Definition: sflib.h:133
#define ut8
Definition: dcpu16.h:8
struct @667 g
#define OP(v, w, x, y, z)
unsigned char match[65280+2]
Definition: gun.c:165
RZ_API void Ht_() free(HtName_(Ht) *ht)
Definition: ht_inc.c:130
voidpf void uLong size
Definition: ioapi.h:138
uint8_t ut8
Definition: lh5801.h:11
return memset(p, 0, total)
void * p
Definition: libc.cpp:67
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))
static void list(RzEgg *egg)
Definition: rz-gg.c:52
RZ_API RZ_OWN RzList * rz_list_newf(RzListFree f)
Returns a new initialized RzList pointer and sets the free method.
Definition: list.c:248
RZ_API RZ_BORROW RzListIter * rz_list_append(RZ_NONNULL RzList *list, void *data)
Appends at the end of the list a new element.
Definition: list.c:288
#define NC
Definition: utils.h:42
#define STRLCPY(x, y, z)
Definition: utils.h:44
#define DUPMAX
Definition: utils.h:39
#define INTFINITY
Definition: utils.h:41
void * realloc(void *ptr, size_t size)
Definition: malloc.c:144
void * malloc(size_t size)
Definition: malloc.c:123
void * calloc(size_t number, size_t size)
Definition: malloc.c:102
static static fork const void static count static fd const char const char static newpath char char char static envp time_t static t const char static mode static whence const char static dir time_t static t unsigned static seconds const char struct utimbuf static buf static inc static sig const char static mode static oldfd struct tms static buf static getgid static geteuid const char static filename static arg static mask struct ustat static ubuf static getppid static setsid static egid sigset_t static set struct timeval struct timezone static tz fd_set fd_set fd_set struct timeval static timeout const char char static bufsiz const char static swapflags void static offset const char static length static mode static who const char struct statfs static buf unsigned unsigned num
Definition: sflib.h:126
int n
Definition: mipsasm.c:19
static void mccase(struct parse *, cset *)
Definition: regcomp.c:1429
#define MAP(n)
#define N
static int samesets(struct re_guts *, int, int)
Definition: regcomp.c:1456
static void stripsnug(struct parse *, struct re_guts *)
Definition: regcomp.c:1630
#define NEXT2()
Definition: regcomp.c:125
#define HERE()
Definition: regcomp.c:137
RZ_API int rz_regex_flags(const char *f)
Definition: regcomp.c:197
#define INF
static char p_b_coll_elem(struct parse *, int)
Definition: regcomp.c:988
#define SEETWO(a, b)
Definition: regcomp.c:121
static void mcadd(struct parse *, cset *, char *)
Definition: regcomp.c:1391
#define GETNEXT()
Definition: regcomp.c:127
static void p_ere_exp(struct parse *)
Definition: regcomp.c:422
static sopno dupl(struct parse *, sopno, sopno)
Definition: regcomp.c:1503
#define DROP(n)
Definition: regcomp.c:140
static char othercase(int)
Definition: regcomp.c:1019
static void p_b_cclass(struct parse *, cset *)
Definition: regcomp.c:923
static void doemit(struct parse *, sop, size_t)
Definition: regcomp.c:1533
#define NPAREN
Definition: regcomp.c:66
RZ_API RzRegex * rz_regex_new(const char *pattern, const char *flags)
Definition: regcomp.c:183
static void nonnewline(struct parse *)
Definition: regcomp.c:1133
static void p_b_eclass(struct parse *, cset *)
Definition: regcomp.c:959
static int freezeset(struct parse *, cset *)
Definition: regcomp.c:1326
RZ_API void rz_regex_free(RzRegex *preg)
Definition: regcomp.c:249
#define BACKSL
#define ASTERN(sop, pos)
Definition: regcomp.c:136
#define AHEAD(pos)
Definition: regcomp.c:135
static void ordinary(struct parse *, int)
Definition: regcomp.c:1062
#define MUSTEAT(c, e)
Definition: regcomp.c:131
#define SETERROR(e)
Definition: regcomp.c:128
#define PEEK2()
Definition: regcomp.c:117
static void categorize(struct parse *, struct re_guts *)
Definition: regcomp.c:1475
static void p_str(struct parse *)
Definition: regcomp.c:573
#define GOODFLAGS(f)
static void bothcases(struct parse *, int)
Definition: regcomp.c:1038
#define INSERT(op, pos)
Definition: regcomp.c:134
static char nuls[10]
Definition: regcomp.c:110
static void repeat(struct parse *, sopno, int, int)
Definition: regcomp.c:1155
static void p_b_term(struct parse *, cset *)
Definition: regcomp.c:859
static int isinsets(struct re_guts *, int)
Definition: regcomp.c:1438
#define NEXTn(n)
Definition: regcomp.c:126
#define PEEK()
Definition: regcomp.c:116
#define SEE(c)
Definition: regcomp.c:120
RZ_API RzList * rz_regex_get_match_list(const char *pattern, const char *flags, const char *text)
Definition: regcomp.c:155
static void p_bre(struct parse *, int, int)
Definition: regcomp.c:590
#define THERE()
Definition: regcomp.c:138
static int nch(struct parse *, cset *)
Definition: regcomp.c:1375
static void doinsert(struct parse *, sop, size_t, sopno)
Definition: regcomp.c:1557
static sopno pluscount(struct parse *, struct re_guts *)
Definition: regcomp.c:1732
static void p_ere(struct parse *, int)
Definition: regcomp.c:381
RZ_API int rz_regex_comp(RzRegex *preg, const char *pattern, int cflags)
Definition: regcomp.c:258
static char p_b_symbol(struct parse *)
Definition: regcomp.c:970
#define MORE()
Definition: regcomp.c:118
static void findmust(struct parse *, struct re_guts *)
Definition: regcomp.c:1649
#define REP(f, t)
static cset * allocset(struct parse *)
Definition: regcomp.c:1242
RZ_API int rz_regex_match(const char *pattern, const char *flags, const char *text)
Definition: regcomp.c:142
static void p_bracket(struct parse *)
Definition: regcomp.c:768
static void dofwd(struct parse *, sopno, sop)
Definition: regcomp.c:1595
static int p_count(struct parse *)
Definition: regcomp.c:749
static void enlarge(struct parse *, sopno)
Definition: regcomp.c:1610
RZ_API void rz_regex_fini(RzRegex *preg)
Definition: regcomp.c:226
#define MORE2()
Definition: regcomp.c:119
#define REQUIRE(co, e)
Definition: regcomp.c:129
#define EAT(c)
Definition: regcomp.c:122
#define EMIT(op, sopnd)
Definition: regcomp.c:133
#define THERETHERE()
Definition: regcomp.c:139
static int p_simp_re(struct parse *, int)
Definition: regcomp.c:621
static void freeset(struct parse *, cset *)
Definition: regcomp.c:1303
static void mcinvert(struct parse *, cset *)
Definition: regcomp.c:1417
static int seterr(struct parse *, int)
Definition: regcomp.c:1230
#define NEXT()
Definition: regcomp.c:124
#define EATTWO(a, b)
Definition: regcomp.c:123
static void special(struct parse *, int)
Definition: regcomp.c:1076
static int firstch(struct parse *, cset *)
Definition: regcomp.c:1360
unsigned long sop
Definition: regex2.h:62
#define CHsub(cs, c)
Definition: regex2.h:114
long sopno
Definition: regex2.h:63
#define O_CH
Definition: regex2.h:89
#define OBOL
Definition: regex2.h:74
#define OCH_
Definition: regex2.h:86
#define OOR2
Definition: regex2.h:88
#define OEND
Definition: regex2.h:72
#define OCHAR
Definition: regex2.h:73
#define OQUEST_
Definition: regex2.h:82
#define OEOL
Definition: regex2.h:75
#define OLPAREN
Definition: regex2.h:84
#define OBOW
Definition: regex2.h:90
#define CHadd(cs, c)
Definition: regex2.h:113
#define OPLUS_
Definition: regex2.h:80
#define USEEOL
Definition: regex2.h:140
#define USEBOL
Definition: regex2.h:139
#define O_QUEST
Definition: regex2.h:83
#define CHIN(cs, c)
Definition: regex2.h:115
#define OANYOF
Definition: regex2.h:77
#define O_PLUS
Definition: regex2.h:81
#define SOP(op, opnd)
Definition: regex2.h:69
#define O_BACK
Definition: regex2.h:79
#define MCadd(p, cs, cp)
Definition: regex2.h:116
#define OBACK_
Definition: regex2.h:78
#define OPSHIFT
Definition: regex2.h:66
#define OEOW
Definition: regex2.h:91
#define MAGIC1
Definition: regex2.h:41
#define ORPAREN
Definition: regex2.h:85
#define MAGIC2
Definition: regex2.h:128
#define OUT
Definition: regex2.h:157
unsigned char cat_t
Definition: regex2.h:121
#define OOR1
Definition: regex2.h:87
#define OANY
Definition: regex2.h:76
#define BAD
Definition: regex2.h:141
#define eprintf(x, y...)
Definition: rlcc.c:7
static RzSocket * s
Definition: rtr.c:28
#define rz_return_val_if_fail(expr, val)
Definition: rz_assert.h:108
#define RZ_REGEX_PEND
Definition: rz_regex.h:28
#define RZ_REGEX_ESPACE
Definition: rz_regex.h:44
RZ_API int rz_regex_exec(const RzRegex *preg, const char *string, size_t nmatch, RzRegexMatch __pmatch[], int eflags)
Definition: regexec.c:149
#define RZ_REGEX_STARTEND
Definition: rz_regex.h:56
#define RZ_REGEX_ICASE
Definition: rz_regex.h:24
#define RZ_REGEX_NEWLINE
Definition: rz_regex.h:26
#define RZ_REGEX_BADBR
Definition: rz_regex.h:42
#define RZ_REGEX_EXTENDED
Definition: rz_regex.h:23
#define RZ_REGEX_ECTYPE
Definition: rz_regex.h:36
#define RZ_REGEX_EBRACK
Definition: rz_regex.h:39
#define RZ_REGEX_NOSPEC
Definition: rz_regex.h:27
#define RZ_REGEX_ECOLLATE
Definition: rz_regex.h:35
#define RZ_REGEX_ERANGE
Definition: rz_regex.h:43
#define RZ_REGEX_ESUBREG
Definition: rz_regex.h:38
#define RZ_REGEX_EBRACE
Definition: rz_regex.h:41
#define RZ_REGEX_EMPTY
Definition: rz_regex.h:46
#define RZ_REGEX_NOSUB
Definition: rz_regex.h:25
#define RZ_REGEX_BADRPT
Definition: rz_regex.h:45
#define RZ_REGEX_DUMP
Definition: rz_regex.h:29
#define RZ_REGEX_INVARG
Definition: rz_regex.h:48
#define RZ_REGEX_EPAREN
Definition: rz_regex.h:40
#define RZ_REGEX_EESCAPE
Definition: rz_regex.h:37
#define RZ_REGEX_ASSERT
Definition: rz_regex.h:47
RZ_API size_t rz_str_ncpy(char *dst, const char *src, size_t n)
Secure string copy with null terminator.
Definition: str.c:923
#define RZ_NEW(x)
Definition: rz_types.h:285
#define RZ_NEWS0(x, y)
Definition: rz_types.h:282
#define RZ_FREE(x)
Definition: rz_types.h:369
#define islower(c)
Definition: safe-ctype.h:135
#define tolower(c)
Definition: safe-ctype.h:149
#define isalpha(c)
Definition: safe-ctype.h:125
#define isdigit(c)
Definition: safe-ctype.h:131
#define isupper(c)
Definition: safe-ctype.h:143
#define toupper(c)
Definition: safe-ctype.h:147
static struct sockaddr static addrlen static backlog const void static flags void struct sockaddr from
Definition: sfsocketcall.h:123
static struct sockaddr static addrlen static backlog const void static flags void struct sockaddr socklen_t static fromlen const void const struct sockaddr to
Definition: sfsocketcall.h:125
static struct sockaddr static addrlen static backlog const void static flags void flags
Definition: sfsocketcall.h:123
int size_t
Definition: sftypes.h:40
#define f(i)
Definition: sha256.c:46
#define c(i)
Definition: sha256.c:43
#define h(i)
Definition: sha256.c:48
Definition: cclass.h:39
char * multis
Definition: cclass.h:42
char * name
Definition: cclass.h:40
char * chars
Definition: cclass.h:41
Definition: cname.h:39
char code
Definition: cname.h:41
char * name
Definition: cname.h:40
Definition: regex2.h:105
ut8 hash
Definition: regex2.h:108
Definition: zipcmp.c:77
Definition: engine.c:71
Definition: regcomp.c:57
struct re_guts * g
Definition: regcomp.c:65
sopno pend[NPAREN]
Definition: regcomp.c:68
char * next
Definition: regcomp.c:58
sopno slen
Definition: regcomp.c:63
sopno ssize
Definition: regcomp.c:62
int error
Definition: regcomp.c:60
sopno pbegin[NPAREN]
Definition: regcomp.c:67
sop * strip
Definition: regcomp.c:61
int ncsalloc
Definition: regcomp.c:64
char * end
Definition: regcomp.c:59
int cflags
Definition: regex2.h:134
const char * re_endp
Definition: rz_regex.h:11
int re_flags
Definition: rz_regex.h:13
size_t re_nsub
Definition: rz_regex.h:10
int re_magic
Definition: rz_regex.h:9
struct re_guts * re_g
Definition: rz_regex.h:12
int pos
Definition: main.c:11
ut64 maxlen
Definition: core.c:76
Definition: dis.c:32
if(dbg->bits==RZ_SYS_BITS_64)
Definition: windows-arm64.h:4
static int sp
Definition: z80asm.c:91
static int cat(char *argv[])
Definition: ziptool.c:170