Process Hacker
pcre_exec.c
Go to the documentation of this file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2010 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44 
45 #define HAVE_CONFIG_H
46 #ifdef HAVE_CONFIG_H
47 #include "config.h"
48 #endif
49 
50 #define NLBLOCK md /* Block containing newline information */
51 #define PSSTART start_subject /* Field containing processed string start */
52 #define PSEND end_subject /* Field containing processed string end */
53 
54 #include "pcre_internal.h"
55 
56 /* Undefine some potentially clashing cpp symbols */
57 
58 #undef min
59 #undef max
60 
61 /* Flag bits for the match() function */
62 
/* Bit values for the "flags" argument of match(). When match_cbegroup is set,
match() records the current subject pointer in a new eptrblock chained onto
eptrb before it starts processing opcodes. */
63 #define match_condassert 0x01 /* Called to check a condition assertion */
64 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68 
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71 
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74 
/* Backtracking-control results. The SKIP, SKIP_ARG and THEN cases in match()
pass an accompanying value back in md->start_match_ptr: the current subject
position for SKIP, the skip name for SKIP_ARG, and the address of the bracket
or alternative at the start of the current branch for THEN. */
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_PRUNE (-997)
78 #define MATCH_SKIP (-996)
79 #define MATCH_SKIP_ARG (-995)
80 #define MATCH_THEN (-994)
81 
82 /* This is a convenience macro for code that occurs many times. */
83 
/* Store the current mark pointer in the match data block (md->mark) and then
leave through RRETURN with result "ra". The expansion site must have "md" and
"markptr" in scope. */
84 #define MRRETURN(ra) \
85  { \
86  md->mark = markptr; \
87  RRETURN(ra); \
88  }
89 
90 /* Maximum number of ints of offset to save on the stack for recursive calls.
91 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
92 because the offset vector is always a multiple of 3 long. */
93 
/* Dimension of the stacksave[] array declared in match() and of the matching
Xstacksave[] member of heapframe in the NO_RECURSE build. */
94 #define REC_STACK_SAVE_MAX 30
95 
96 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
97 
/* Two parallel six-entry tables: entry k of rep_min pairs with entry k of
rep_max. NOTE(review): presumably indexed by the repeat opcode's offset from
the first repeat opcode inside match(); the indexing code is not visible in
this part of the file - confirm before relying on the order. */
98 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
99 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
100 
101 
102 
103 #ifdef PCRE_DEBUG
104 /*************************************************
105 * Debugging function to print chars *
106 *************************************************/
107 
108 /* Print a sequence of chars in printable format, stopping at the end of the
109 subject if requested.
110 
111 Arguments:
112  p points to characters
113  length number to print
114  is_subject TRUE if printing from within md->start_subject
115  md pointer to matching data block, if is_subject is TRUE
116 
117 Returns: nothing
118 */
119 
120 static void
121 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
122 {
123 unsigned int c;
124 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
125 while (length-- > 0)
126  if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
127 }
128 #endif
129 
130 
131 
132 /*************************************************
133 * Match a back-reference *
134 *************************************************/
135 
136 /* If a back reference hasn't been set, the length that is passed is greater
137 than the number of characters left in the string, so the match fails.
138 
139 Arguments:
140  offset index into the offset vector
141  eptr points into the subject
142  length length to be matched
143  md points to match data block
144  ims the ims flags
145 
146 Returns: TRUE if matched
147 */
148 
149 static BOOL
150 match_ref(int offset, register USPTR eptr, int length, match_data *md,
151  unsigned long int ims)
152 {
153 USPTR p = md->start_subject + md->offset_vector[offset];
154 
155 #ifdef PCRE_DEBUG
156 if (eptr >= md->end_subject)
157  printf("matching subject <null>");
158 else
159  {
160  printf("matching subject ");
161  pchars(eptr, length, TRUE, md);
162  }
163 printf(" against backref ");
164 pchars(p, length, FALSE, md);
165 printf("\n");
166 #endif
167 
168 /* Always fail if not enough characters left */
169 
170 if (length > md->end_subject - eptr) return FALSE;
171 
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175 
176 if ((ims & PCRE_CASELESS) != 0)
177  {
178 #ifdef SUPPORT_UTF8
179 #ifdef SUPPORT_UCP
180  if (md->utf8)
181  {
182  USPTR endptr = eptr + length;
183  while (eptr < endptr)
184  {
185  int c, d;
186  GETCHARINC(c, eptr);
187  GETCHARINC(d, p);
188  if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
189  }
190  }
191  else
192 #endif
193 #endif
194 
195  /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
196  is no UCP support. */
197 
198  while (length-- > 0)
199  { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
200  }
201 
202 /* In the caseful case, we can just compare the bytes, whether or not we
203 are in UTF-8 mode. */
204 
205 else
206  { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
207 
208 return TRUE;
209 }
210 
211 
212 
213 /***************************************************************************
214 ****************************************************************************
215  RECURSION IN THE match() FUNCTION
216 
217 The match() function is highly recursive, though not every recursive call
218 increases the recursive depth. Nevertheless, some regular expressions can cause
219 it to recurse to a great depth. I was writing for Unix, so I just let it call
220 itself recursively. This uses the stack for saving everything that has to be
221 saved for a recursive call. On Unix, the stack can be large, and this works
222 fine.
223 
224 It turns out that on some non-Unix-like systems there are problems with
225 programs that use a lot of stack. (This despite the fact that every last chip
226 has oodles of memory these days, and techniques for extending the stack have
227 been known for decades.) So....
228 
229 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
230 calls by keeping local variables that need to be preserved in blocks of memory
231 obtained from malloc() instead of on the stack. Macros are used to
232 achieve this so that the actual code doesn't look very different to what it
233 always used to.
234 
235 The original heap-recursive code used longjmp(). However, it seems that this
236 can be very slow on some operating systems. Following a suggestion from Stan
237 Switzer, the use of longjmp() has been abolished, at the cost of having to
238 provide a unique number for each call to RMATCH. There is no way of generating
239 a sequence of numbers at compile time in C. I have given them names, to make
240 them stand out more clearly.
241 
242 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
243 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
244 tests. Furthermore, not using longjmp() means that local dynamic variables
245 don't have indeterminate values; this has meant that the frame size can be
246 reduced because the result can be "passed back" by straight setting of the
247 variable instead of being passed in the frame.
248 ****************************************************************************
249 ***************************************************************************/
250 
251 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
252 below must be updated in sync. */
253 
/* One identifier per RMATCH call site. In the NO_RECURSE build RMATCH stores
the identifier in the current frame's Xwhere field and pastes it into the
L_<id> label at which execution resumes, so every identifier used by match()
must be listed here. The values are contiguous, starting at 1. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62 };
261 
262 /* These versions of the macros use the stack, as normal. There are debugging
263 versions and production versions. Note that the "rw" argument of RMATCH isn't
264 actually used in this definition. */
265 
266 #ifndef NO_RECURSE
/* With true recursion, the REGISTER qualifier used on match()'s first two
parameters expands to the real "register" keyword. */
267 #define REGISTER register
268 
269 #ifdef PCRE_DEBUG
/* Debugging flavour: print a trace line before and after the recursive call.
The call forwards mstart and markptr from the enclosing scope, passes
rdepth+1 as the new depth, and leaves the result in rrc. */
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271  { \
272  printf("match() called in line %d\n", __LINE__); \
273  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
274  printf("to line %d\n", __LINE__); \
275  }
/* Debugging flavour: print the value being returned and the source line,
then return it. */
276 #define RRETURN(ra) \
277  { \
278  printf("match() returned %d from line %d ", ra, __LINE__); \
279  return ra; \
280  }
281 #else
/* Production flavour: a plain recursive call whose result lands in rrc, and a
plain return statement. */
282 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
283  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
284 #define RRETURN(ra) return ra
285 #endif
286 
287 #else
288 
289 
290 /* These versions of the macros manage a private stack on the heap. Note that
291 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
292 argument of match(), which never changes. */
293 
/* In the heap-frame build REGISTER expands to nothing. */
294 #define REGISTER
295 
/* Heap-frame flavour of RMATCH. It obtains a new heapframe from
pcre_stack_malloc (leaving through RRETURN with PCRE_ERROR_NOMEMORY if that
fails), records the resume identifier rw in the current frame's Xwhere, copies
the call arguments plus mstart and markptr into the new frame, sets its depth
to the current depth plus one, links it to the current frame via Xprevframe,
makes it the current frame and jumps to HEAP_RECURSE. The L_<rw> label
generated at the end is where execution continues once the "call" returns. */
296 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
297  {\
298  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
299  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
300  frame->Xwhere = rw; \
301  newframe->Xeptr = ra;\
302  newframe->Xecode = rb;\
303  newframe->Xmstart = mstart;\
304  newframe->Xmarkptr = markptr;\
305  newframe->Xoffset_top = rc;\
306  newframe->Xims = re;\
307  newframe->Xeptrb = rf;\
308  newframe->Xflags = rg;\
309  newframe->Xrdepth = frame->Xrdepth + 1;\
310  newframe->Xprevframe = frame;\
311  frame = newframe;\
312  DPRINTF(("restarting from line %d\n", __LINE__));\
313  goto HEAP_RECURSE;\
314  L_##rw:\
315  DPRINTF(("jumped back to line %d\n", __LINE__));\
316  }
317 
/* Heap-frame flavour of RRETURN. Pop the current frame: release it with
pcre_stack_free and make its predecessor current. If a predecessor exists, put
the result in rrc and jump to HEAP_RETURN so that the "caller" resumes;
otherwise this was the outermost frame and the result is returned from match().

match() also invokes this macro when the allocation of the outermost frame
itself has failed, at which point "frame" is NULL. That case must not touch
the frame; it simply returns the result. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
330 
331 
332 /* Structure for remembering the local variables in a private frame */
333 
334 typedef struct heapframe {
335  struct heapframe *Xprevframe;
336 
337  /* Function arguments that may change */
338 
339  USPTR Xeptr;
340  const uschar *Xecode;
341  USPTR Xmstart;
342  USPTR Xmarkptr;
343  int Xoffset_top;
344  long int Xims;
345  eptrblock *Xeptrb;
346  int Xflags;
347  unsigned int Xrdepth;
348 
349  /* Function local variables */
350 
351  USPTR Xcallpat;
352 #ifdef SUPPORT_UTF8
353  USPTR Xcharptr;
354 #endif
355  USPTR Xdata;
356  USPTR Xnext;
357  USPTR Xpp;
358  USPTR Xprev;
359  USPTR Xsaved_eptr;
360 
361  recursion_info Xnew_recursive;
362 
363  BOOL Xcur_is_word;
364  BOOL Xcondition;
365  BOOL Xprev_is_word;
366 
367  unsigned long int Xoriginal_ims;
368 
369 #ifdef SUPPORT_UCP
370  int Xprop_type;
371  int Xprop_value;
372  int Xprop_fail_result;
373  int Xprop_category;
374  int Xprop_chartype;
375  int Xprop_script;
376  int Xoclength;
377  uschar Xocchars[8];
378 #endif
379 
380  int Xcodelink;
381  int Xctype;
382  unsigned int Xfc;
383  int Xfi;
384  int Xlength;
385  int Xmax;
386  int Xmin;
387  int Xnumber;
388  int Xoffset;
389  int Xop;
390  int Xsave_capture_last;
391  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
392  int Xstacksave[REC_STACK_SAVE_MAX];
393 
394  eptrblock Xnewptrb;
395 
396  /* Where to jump back to */
397 
398  int Xwhere;
399 
400 } heapframe;
401 
402 #endif
403 
404 
405 /***************************************************************************
406 ***************************************************************************/
407 
408 
409 
410 /*************************************************
411 * Match from current position *
412 *************************************************/
413 
414 /* This function is called recursively in many circumstances. Whenever it
415 returns a negative (error) response, the outer incarnation must also return the
416 same response. */
417 
418 /* These macros pack up tests that are used for partial matching, and which
419 appear several times in the code. We set the "hit end" flag if the pointer is
420 at the end of the subject and also past the start of the subject (i.e.
421 something has been matched). For hard partial matching, we then return
422 immediately. The second one is used when we already know we are past the end of
423 the subject. */
424 
/* Full test: applies when partial matching is enabled (md->partial != 0),
eptr has reached md->end_subject, and eptr is beyond md->start_used_ptr. It
sets md->hitend and, when md->partial is greater than 1, leaves through
MRRETURN with PCRE_ERROR_PARTIAL. Expands to a bare "if" statement and relies
on "md" and "eptr" being in scope. */
425 #define CHECK_PARTIAL()\
426  if (md->partial != 0 && eptr >= md->end_subject && \
427  eptr > md->start_used_ptr) \
428  { \
429  md->hitend = TRUE; \
430  if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
431  }
432 
/* Short test: identical except that the comparison of eptr with
md->end_subject is omitted, for call sites that have already made it. */
433 #define SCHECK_PARTIAL()\
434  if (md->partial != 0 && eptr > md->start_used_ptr) \
435  { \
436  md->hitend = TRUE; \
437  if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
438  }
439 
440 
441 /* Performance note: It might be tempting to extract commonly used fields from
442 the md structure (e.g. utf8, end_subject) into individual variables to improve
443 performance. Tests using gcc on a SPARC disproved this; in the first case, it
444 made performance worse.
445 
446 Arguments:
447  eptr pointer to current character in subject
448  ecode pointer to current position in compiled code
449  mstart pointer to the current match start position (can be modified
450  by encountering \K)
451  markptr pointer to the most recent MARK name, or NULL
452  offset_top current top pointer
453  md pointer to "static" info for the match
454  ims current /i, /m, and /s options
455  eptrb pointer to chain of blocks containing eptr at start of
456  brackets - for testing for empty matches
457  flags can contain
458  match_condassert - this is an assertion condition
459  match_cbegroup - this is the start of an unlimited repeat
460  group that can match an empty string
461  rdepth the recursion depth
462 
463 Returns: MATCH_MATCH if matched ) these values are >= 0
464  MATCH_NOMATCH if failed to match )
465  a negative MATCH_xxx value for PRUNE, SKIP, etc
466  a negative PCRE_ERROR_xxx value if aborted by an error condition
467  (e.g. stopped by repeated call or recursion limit)
468 */
469 
470 static int
471 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
472  const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
473  eptrblock *eptrb, int flags, unsigned int rdepth)
474 {
475 /* These variables do not need to be preserved over recursion in this function,
476 so they can be ordinary variables in all cases. Mark some of them with
477 "register" because they are used a lot in loops. */
478 
479 register int rrc; /* Returns from recursive calls */
480 register int i; /* Used for loops not involving calls to RMATCH() */
481 register unsigned int c; /* Character values not kept over RMATCH() calls */
482 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
483 
484 BOOL minimize, possessive; /* Quantifier options */
485 int condcode;
486 
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame" which is obtained from
489 heap storage. Set up the top-level frame here; others are obtained from the
490 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491 
492 #ifdef NO_RECURSE
493 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
494 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 frame->Xprevframe = NULL; /* Marks the top level */
496 
497 /* Copy in the original argument variables */
498 
499 frame->Xeptr = eptr;
500 frame->Xecode = ecode;
501 frame->Xmstart = mstart;
502 frame->Xmarkptr = markptr;
503 frame->Xoffset_top = offset_top;
504 frame->Xims = ims;
505 frame->Xeptrb = eptrb;
506 frame->Xflags = flags;
507 frame->Xrdepth = rdepth;
508 
509 /* This is where control jumps back to in order to effect "recursion" */
510 
511 HEAP_RECURSE:
512 
513 /* Macros make the argument variables come from the current frame */
514 
515 #define eptr frame->Xeptr
516 #define ecode frame->Xecode
517 #define mstart frame->Xmstart
518 #define markptr frame->Xmarkptr
519 #define offset_top frame->Xoffset_top
520 #define ims frame->Xims
521 #define eptrb frame->Xeptrb
522 #define flags frame->Xflags
523 #define rdepth frame->Xrdepth
524 
525 /* Ditto for the local variables */
526 
527 #ifdef SUPPORT_UTF8
528 #define charptr frame->Xcharptr
529 #endif
530 #define callpat frame->Xcallpat
531 #define codelink frame->Xcodelink
532 #define data frame->Xdata
533 #define next frame->Xnext
534 #define pp frame->Xpp
535 #define prev frame->Xprev
536 #define saved_eptr frame->Xsaved_eptr
537 
538 #define new_recursive frame->Xnew_recursive
539 
540 #define cur_is_word frame->Xcur_is_word
541 #define condition frame->Xcondition
542 #define prev_is_word frame->Xprev_is_word
543 
544 #define original_ims frame->Xoriginal_ims
545 
546 #ifdef SUPPORT_UCP
547 #define prop_type frame->Xprop_type
548 #define prop_value frame->Xprop_value
549 #define prop_fail_result frame->Xprop_fail_result
550 #define prop_category frame->Xprop_category
551 #define prop_chartype frame->Xprop_chartype
552 #define prop_script frame->Xprop_script
553 #define oclength frame->Xoclength
554 #define occhars frame->Xocchars
555 #endif
556 
557 #define ctype frame->Xctype
558 #define fc frame->Xfc
559 #define fi frame->Xfi
560 #define length frame->Xlength
561 #define max frame->Xmax
562 #define min frame->Xmin
563 #define number frame->Xnumber
564 #define offset frame->Xoffset
565 #define op frame->Xop
566 #define save_capture_last frame->Xsave_capture_last
567 #define save_offset1 frame->Xsave_offset1
568 #define save_offset2 frame->Xsave_offset2
569 #define save_offset3 frame->Xsave_offset3
570 #define stacksave frame->Xstacksave
571 
572 #define newptrb frame->Xnewptrb
573 
574 /* When recursion is being used, local variables are allocated on the stack and
575 get preserved during recursion in the normal way. In this environment, fi and
576 i, and fc and c, can be the same variables. */
577 
578 #else /* NO_RECURSE not defined */
579 #define fi i
580 #define fc c
581 
582 
583 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
584 const uschar *charptr; /* in small blocks of the code. My normal */
585 #endif /* style of coding would have declared */
586 const uschar *callpat; /* them within each of those blocks. */
587 const uschar *data; /* However, in order to accommodate the */
588 const uschar *next; /* version of this code that uses an */
589 USPTR pp; /* external "stack" implemented on the */
590 const uschar *prev; /* heap, it is easier to declare them all */
591 USPTR saved_eptr; /* here, so the declarations can be cut */
592  /* out in a block. The only declarations */
593 recursion_info new_recursive; /* within blocks below are for variables */
594  /* that do not have to be preserved over */
595 BOOL cur_is_word; /* a recursive call to RMATCH(). */
596 BOOL condition;
597 BOOL prev_is_word;
598 
599 unsigned long int original_ims;
600 
601 #ifdef SUPPORT_UCP
602 int prop_type;
603 int prop_value;
604 int prop_fail_result;
605 int prop_category;
606 int prop_chartype;
607 int prop_script;
608 int oclength;
609 uschar occhars[8];
610 #endif
611 
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623 
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626 
627 /* These statements are here to stop the compiler complaining about uninitialized
628 variables. */
629 
630 #ifdef SUPPORT_UCP
631 prop_value = 0;
632 prop_fail_result = 0;
633 #endif
634 
635 
636 /* This label is used for tail recursion, which is used in a few cases even
637 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
638 used. Thanks to Ian Taylor for noticing this possibility and sending the
639 original patch. */
640 
641 TAIL_RECURSE:
642 
643 /* OK, now we can get on with the real code of the function. Recursive calls
644 are specified by the macro RMATCH and RRETURN is used to return. When
645 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
646 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
647 defined). However, RMATCH isn't like a function call because it's quite a
648 complicated macro. It has to be used in one particular way. This shouldn't,
649 however, impact performance when true recursion is being used. */
650 
651 #ifdef SUPPORT_UTF8
652 utf8 = md->utf8; /* Local copy of the flag */
653 #else
654 utf8 = FALSE;
655 #endif
656 
657 /* First check that we haven't called match() too many times, or that we
658 haven't exceeded the recursive call limit. */
659 
662 
663 original_ims = ims; /* Save for resetting on ')' */
664 
665 /* At the start of a group with an unlimited repeat that may match an empty
666 string, the match_cbegroup flag is set. When this is the case, add the current
667 subject pointer to the chain of such remembered pointers, to be checked when we
668 hit the closing ket, in order to break infinite loops that match no characters.
669 When match() is called in other circumstances, don't add to the chain. The
670 match_cbegroup flag must NOT be used with tail recursion, because the memory
671 block that is used is on the stack, so a new one may be required for each
672 match(). */
673 
674 if ((flags & match_cbegroup) != 0)
675  {
676  newptrb.epb_saved_eptr = eptr;
677  newptrb.epb_prev = eptrb;
678  eptrb = &newptrb;
679  }
680 
681 /* Now start processing the opcodes. */
682 
683 for (;;)
684  {
685  minimize = possessive = FALSE;
686  op = *ecode;
687 
688  switch(op)
689  {
690  case OP_MARK:
691  markptr = ecode + 2;
692  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
693  ims, eptrb, flags, RM55);
694 
695  /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
696  argument, and we must check whether that argument matches this MARK's
697  argument. It is passed back in md->start_match_ptr (an overloading of that
698  variable). If it does match, we reset that variable to the current subject
699  position and return MATCH_SKIP. Otherwise, pass back the return code
700  unaltered. */
701 
702  if (rrc == MATCH_SKIP_ARG &&
703  strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
704  {
705  md->start_match_ptr = eptr;
707  }
708 
709  if (md->mark == NULL) md->mark = markptr;
710  RRETURN(rrc);
711 
712  case OP_FAIL:
714 
715  /* COMMIT overrides PRUNE, SKIP, and THEN */
716 
717  case OP_COMMIT:
718  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719  ims, eptrb, flags, RM52);
720  if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
721  rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
722  rrc != MATCH_THEN)
723  RRETURN(rrc);
725 
726  /* PRUNE overrides THEN */
727 
728  case OP_PRUNE:
729  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
730  ims, eptrb, flags, RM51);
731  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
733 
734  case OP_PRUNE_ARG:
735  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
736  ims, eptrb, flags, RM56);
737  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738  md->mark = ecode + 2;
740 
741  /* SKIP overrides PRUNE and THEN */
742 
743  case OP_SKIP:
744  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
745  ims, eptrb, flags, RM53);
746  if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
747  RRETURN(rrc);
748  md->start_match_ptr = eptr; /* Pass back current position */
750 
751  case OP_SKIP_ARG:
752  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
753  ims, eptrb, flags, RM57);
754  if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
755  RRETURN(rrc);
756 
757  /* Pass back the current skip name by overloading md->start_match_ptr and
758  returning the special MATCH_SKIP_ARG return code. This will either be
759  caught by a matching MARK, or get to the top, where it is treated the same
760  as PRUNE. */
761 
762  md->start_match_ptr = ecode + 2;
764 
765  /* For THEN (and THEN_ARG) we pass back the address of the bracket or
766  the alt that is at the start of the current branch. This makes it possible
767  to skip back past alternatives that precede the THEN within the current
768  branch. */
769 
770  case OP_THEN:
771  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
772  ims, eptrb, flags, RM54);
773  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
774  md->start_match_ptr = ecode - GET(ecode, 1);
776 
777  case OP_THEN_ARG:
778  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
779  offset_top, md, ims, eptrb, flags, RM58);
780  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
781  md->start_match_ptr = ecode - GET(ecode, 1);
782  md->mark = ecode + LINK_SIZE + 2;
784 
785  /* Handle a capturing bracket. If there is space in the offset vector, save
786  the current subject position in the working slot at the top of the vector.
787  We mustn't change the current values of the data slot, because they may be
788  set from a previous iteration of this group, and be referred to by a
789  reference inside the group.
790 
791  If the bracket fails to match, we need to restore this value and also the
792  values of the final offsets, in case they were set by a previous iteration
793  of the same bracket.
794 
795  If there isn't enough space in the offset vector, treat this as if it were
796  a non-capturing bracket. Don't worry about setting the flag for the error
797  case here; that is handled in the code for KET. */
798 
799  case OP_CBRA:
800  case OP_SCBRA:
801  number = GET2(ecode, 1+LINK_SIZE);
802  offset = number << 1;
803 
804 #ifdef PCRE_DEBUG
805  printf("start bracket %d\n", number);
806  printf("subject=");
807  pchars(eptr, 16, TRUE, md);
808  printf("\n");
809 #endif
810 
811  if (offset < md->offset_max)
812  {
813  save_offset1 = md->offset_vector[offset];
814  save_offset2 = md->offset_vector[offset+1];
815  save_offset3 = md->offset_vector[md->offset_end - number];
816  save_capture_last = md->capture_last;
817 
818  DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
819  md->offset_vector[md->offset_end - number] =
820  (int)(eptr - md->start_subject);
821 
822  flags = (op == OP_SCBRA)? match_cbegroup : 0;
823  do
824  {
825  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
826  ims, eptrb, flags, RM1);
827  if (rrc != MATCH_NOMATCH &&
828  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
829  RRETURN(rrc);
830  md->capture_last = save_capture_last;
831  ecode += GET(ecode, 1);
832  }
833  while (*ecode == OP_ALT);
834 
835  DPRINTF(("bracket %d failed\n", number));
836 
837  md->offset_vector[offset] = save_offset1;
838  md->offset_vector[offset+1] = save_offset2;
839  md->offset_vector[md->offset_end - number] = save_offset3;
840 
841  if (rrc != MATCH_THEN) md->mark = markptr;
843  }
844 
845  /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
846  as a non-capturing bracket. */
847 
848  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
849  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
850 
851  DPRINTF(("insufficient capture room: treat as non-capturing\n"));
852 
853  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
854  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
855 
856  /* Non-capturing bracket. Loop for all the alternatives. When we get to the
857  final alternative within the brackets, we would return the result of a
858  recursive call to match() whatever happened. We can reduce stack usage by
859  turning this into a tail recursion, except in the case when match_cbegroup
860  is set.*/
861 
862  case OP_BRA:
863  case OP_SBRA:
864  DPRINTF(("start non-capturing bracket\n"));
865  flags = (op >= OP_SBRA)? match_cbegroup : 0;
866  for (;;)
867  {
868  if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
869  {
870  if (flags == 0) /* Not a possibly empty group */
871  {
872  ecode += _pcre_OP_lengths[*ecode];
873  DPRINTF(("bracket 0 tail recursion\n"));
874  goto TAIL_RECURSE;
875  }
876 
877  /* Possibly empty group; can't use tail recursion. */
878 
879  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
880  eptrb, flags, RM48);
881  if (rrc == MATCH_NOMATCH) md->mark = markptr;
882  RRETURN(rrc);
883  }
884 
885  /* For non-final alternatives, continue the loop for a NOMATCH result;
886  otherwise return. */
887 
888  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
889  eptrb, flags, RM2);
890  if (rrc != MATCH_NOMATCH &&
891  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
892  RRETURN(rrc);
893  ecode += GET(ecode, 1);
894  }
895  /* Control never reaches here. */
896 
897  /* Conditional group: compilation checked that there are no more than
898  two branches. If the condition is false, skipping the first branch takes us
899  past the end if there is only one branch, but that's OK because that is
900  exactly what going to the ket would do. As there is only one branch to be
901  obeyed, we can use tail recursion to avoid using another stack frame. */
902 
903  case OP_COND:
904  case OP_SCOND:
905  codelink= GET(ecode, 1);
906 
907  /* Because of the way auto-callout works during compile, a callout item is
908  inserted between OP_COND and an assertion condition. */
909 
910  if (ecode[LINK_SIZE+1] == OP_CALLOUT)
911  {
912  if (pcre_callout != NULL)
913  {
915  cb.version = 1; /* Version 1 of the callout block */
916  cb.callout_number = ecode[LINK_SIZE+2];
917  cb.offset_vector = md->offset_vector;
918  cb.subject = (PCRE_SPTR)md->start_subject;
919  cb.subject_length = (int)(md->end_subject - md->start_subject);
920  cb.start_match = (int)(mstart - md->start_subject);
921  cb.current_position = (int)(eptr - md->start_subject);
922  cb.pattern_position = GET(ecode, LINK_SIZE + 3);
923  cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
924  cb.capture_top = offset_top/2;
925  cb.capture_last = md->capture_last;
926  cb.callout_data = md->callout_data;
927  if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
928  if (rrc < 0) RRETURN(rrc);
929  }
930  ecode += _pcre_OP_lengths[OP_CALLOUT];
931  }
932 
933  condcode = ecode[LINK_SIZE+1];
934 
935  /* Now see what the actual condition is */
936 
937  if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
938  {
939  if (md->recursive == NULL) /* Not recursing => FALSE */
940  {
941  condition = FALSE;
942  ecode += GET(ecode, 1);
943  }
944  else
945  {
946  int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
947  condition = (recno == RREF_ANY || recno == md->recursive->group_num);
948 
949  /* If the test is for recursion into a specific subpattern, and it is
950  false, but the test was set up by name, scan the table to see if the
951  name refers to any other numbers, and test them. The condition is true
952  if any one is set. */
953 
954  if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
955  {
956  uschar *slotA = md->name_table;
957  for (i = 0; i < md->name_count; i++)
958  {
959  if (GET2(slotA, 0) == recno) break;
960  slotA += md->name_entry_size;
961  }
962 
963  /* Found a name for the number - there can be only one; duplicate
964  names for different numbers are allowed, but not vice versa. First
965  scan down for duplicates. */
966 
967  if (i < md->name_count)
968  {
969  uschar *slotB = slotA;
970  while (slotB > md->name_table)
971  {
972  slotB -= md->name_entry_size;
973  if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
974  {
975  condition = GET2(slotB, 0) == md->recursive->group_num;
976  if (condition) break;
977  }
978  else break;
979  }
980 
981  /* Scan up for duplicates */
982 
983  if (!condition)
984  {
985  slotB = slotA;
986  for (i++; i < md->name_count; i++)
987  {
988  slotB += md->name_entry_size;
989  if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
990  {
991  condition = GET2(slotB, 0) == md->recursive->group_num;
992  if (condition) break;
993  }
994  else break;
995  }
996  }
997  }
998  }
999 
1000  /* Choose branch according to the condition */
1001 
1002  ecode += condition? 3 : GET(ecode, 1);
1003  }
1004  }
1005 
1006  else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1007  {
1008  offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1009  condition = offset < offset_top && md->offset_vector[offset] >= 0;
1010 
1011  /* If the numbered capture is unset, but the reference was by name,
1012  scan the table to see if the name refers to any other numbers, and test
1013  them. The condition is true if any one is set. This is tediously similar
1014  to the code above, but not close enough to try to amalgamate. */
1015 
1016  if (!condition && condcode == OP_NCREF)
1017  {
1018  int refno = offset >> 1;
1019  uschar *slotA = md->name_table;
1020 
1021  for (i = 0; i < md->name_count; i++)
1022  {
1023  if (GET2(slotA, 0) == refno) break;
1024  slotA += md->name_entry_size;
1025  }
1026 
1027  /* Found a name for the number - there can be only one; duplicate names
1028  for different numbers are allowed, but not vice versa. First scan down
1029  for duplicates. */
1030 
1031  if (i < md->name_count)
1032  {
1033  uschar *slotB = slotA;
1034  while (slotB > md->name_table)
1035  {
1036  slotB -= md->name_entry_size;
1037  if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1038  {
1039  offset = GET2(slotB, 0) << 1;
1040  condition = offset < offset_top &&
1041  md->offset_vector[offset] >= 0;
1042  if (condition) break;
1043  }
1044  else break;
1045  }
1046 
1047  /* Scan up for duplicates */
1048 
1049  if (!condition)
1050  {
1051  slotB = slotA;
1052  for (i++; i < md->name_count; i++)
1053  {
1054  slotB += md->name_entry_size;
1055  if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1056  {
1057  offset = GET2(slotB, 0) << 1;
1058  condition = offset < offset_top &&
1059  md->offset_vector[offset] >= 0;
1060  if (condition) break;
1061  }
1062  else break;
1063  }
1064  }
1065  }
1066  }
1067 
1068  /* Choose branch according to the condition */
1069 
1070  ecode += condition? 3 : GET(ecode, 1);
1071  }
1072 
1073  else if (condcode == OP_DEF) /* DEFINE - always false */
1074  {
1075  condition = FALSE;
1076  ecode += GET(ecode, 1);
1077  }
1078 
1079  /* The condition is an assertion. Call match() to evaluate it - setting
1080  the final argument match_condassert causes it to stop at the end of an
1081  assertion. */
1082 
1083  else
1084  {
1085  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1087  if (rrc == MATCH_MATCH)
1088  {
1089  condition = TRUE;
1090  ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1091  while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1092  }
1093  else if (rrc != MATCH_NOMATCH &&
1094  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1095  {
1096  RRETURN(rrc); /* Need braces because of following else */
1097  }
1098  else
1099  {
1100  condition = FALSE;
1101  ecode += codelink;
1102  }
1103  }
1104 
1105  /* We are now at the branch that is to be obeyed. As there is only one,
1106  we can use tail recursion to avoid using another stack frame, except when
1107  match_cbegroup is required for an unlimited repeat of a possibly empty
1108  group. If the second alternative doesn't exist, we can just plough on. */
1109 
1110  if (condition || *ecode == OP_ALT)
1111  {
1112  ecode += 1 + LINK_SIZE;
1113  if (op == OP_SCOND) /* Possibly empty group */
1114  {
1115  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1116  RRETURN(rrc);
1117  }
1118  else /* Group must match something */
1119  {
1120  flags = 0;
1121  goto TAIL_RECURSE;
1122  }
1123  }
1124  else /* Condition false & no alternative */
1125  {
1126  ecode += 1 + LINK_SIZE;
1127  }
1128  break;
1129 
1130 
1131  /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1132  to close any currently open capturing brackets. */
1133 
1134  case OP_CLOSE:
1135  number = GET2(ecode, 1);
1136  offset = number << 1;
1137 
1138 #ifdef PCRE_DEBUG
1139  printf("end bracket %d at *ACCEPT", number);
1140  printf("\n");
1141 #endif
1142 
1143  md->capture_last = number;
1144  if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1145  {
1146  md->offset_vector[offset] =
1147  md->offset_vector[md->offset_end - number];
1148  md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1149  if (offset_top <= offset) offset_top = offset + 2;
1150  }
1151  ecode += 3;
1152  break;
1153 
1154 
1155  /* End of the pattern, either real or forced. If we are in a top-level
1156  recursion, we should restore the offsets appropriately and continue from
1157  after the call. */
1158 
1159  case OP_ACCEPT:
1160  case OP_END:
1161  if (md->recursive != NULL && md->recursive->group_num == 0)
1162  {
1163  recursion_info *rec = md->recursive;
1164  DPRINTF(("End of pattern in a (?0) recursion\n"));
1165  md->recursive = rec->prevrec;
1166  memmove(md->offset_vector, rec->offset_save,
1167  rec->saved_max * sizeof(int));
1168  offset_top = rec->save_offset_top;
1169  ims = original_ims;
1170  ecode = rec->after_call;
1171  break;
1172  }
1173 
1174  /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1175  set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1176  the subject. In both cases, backtracking will then try other alternatives,
1177  if any. */
1178 
1179  if (eptr == mstart &&
1180  (md->notempty ||
1181  (md->notempty_atstart &&
1182  mstart == md->start_subject + md->start_offset)))
1184 
1185  /* Otherwise, we have a match. */
1186 
1187  md->end_match_ptr = eptr; /* Record where we ended */
1188  md->end_offset_top = offset_top; /* and how many extracts were taken */
1189  md->start_match_ptr = mstart; /* and the start (\K can modify) */
1190 
1191  /* For some reason, the macros don't work properly if an expression is
1192  given as the argument to MRRETURN when the heap is in use. */
1193 
1194  rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1195  MRRETURN(rrc);
1196 
1197  /* Change option settings */
1198 
1199  case OP_OPT:
1200  ims = ecode[1];
1201  ecode += 2;
1202  DPRINTF(("ims set to %02lx\n", ims));
1203  break;
1204 
1205  /* Assertion brackets. Check the alternative branches in turn - the
1206  matching won't pass the KET for an assertion. If any one branch matches,
1207  the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1208  start of each branch to move the current point backwards, so the code at
1209  this level is identical to the lookahead case. */
1210 
1211  case OP_ASSERT:
1212  case OP_ASSERTBACK:
1213  do
1214  {
1215  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1216  RM4);
1217  if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1218  {
1219  mstart = md->start_match_ptr; /* In case \K reset it */
1220  break;
1221  }
1222  if (rrc != MATCH_NOMATCH &&
1223  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1224  RRETURN(rrc);
1225  ecode += GET(ecode, 1);
1226  }
1227  while (*ecode == OP_ALT);
1228  if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1229 
1230  /* If checking an assertion for a condition, return MATCH_MATCH. */
1231 
1232  if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1233 
1234  /* Continue from after the assertion, updating the offsets high water
1235  mark, since extracts may have been taken during the assertion. */
1236 
1237  do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1238  ecode += 1 + LINK_SIZE;
1239  offset_top = md->end_offset_top;
1240  continue;
1241 
1242  /* Negative assertion: all branches must fail to match. Encountering SKIP,
1243  PRUNE, or COMMIT means we must assume failure without checking subsequent
1244  branches. */
1245 
1246  case OP_ASSERT_NOT:
1247  case OP_ASSERTBACK_NOT:
1248  do
1249  {
1250  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1251  RM5);
1252  if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1253  if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1254  {
1255  do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1256  break;
1257  }
1258  if (rrc != MATCH_NOMATCH &&
1259  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1260  RRETURN(rrc);
1261  ecode += GET(ecode,1);
1262  }
1263  while (*ecode == OP_ALT);
1264 
1265  if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1266 
1267  ecode += 1 + LINK_SIZE;
1268  continue;
1269 
1270  /* Move the subject pointer back. This occurs only at the start of
1271  each branch of a lookbehind assertion. If we are too close to the start to
1272  move back, this match function fails. When working with UTF-8 we move
1273  back a number of characters, not bytes. */
1274 
1275  case OP_REVERSE:
1276 #ifdef SUPPORT_UTF8
1277  if (utf8)
1278  {
1279  i = GET(ecode, 1);
1280  while (i-- > 0)
1281  {
1282  eptr--;
1283  if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1284  BACKCHAR(eptr);
1285  }
1286  }
1287  else
1288 #endif
1289 
1290  /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1291 
1292  {
1293  eptr -= GET(ecode, 1);
1294  if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1295  }
1296 
1297  /* Save the earliest consulted character, then skip to next op code */
1298 
1299  if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1300  ecode += 1 + LINK_SIZE;
1301  break;
1302 
1303  /* The callout item calls an external function, if one is provided, passing
1304  details of the match so far. This is mainly for debugging, though the
1305  function is able to force a failure. */
1306 
1307  case OP_CALLOUT:
1308  if (pcre_callout != NULL)
1309  {
1310  pcre_callout_block cb;
1311  cb.version = 1; /* Version 1 of the callout block */
1312  cb.callout_number = ecode[1];
1313  cb.offset_vector = md->offset_vector;
1314  cb.subject = (PCRE_SPTR)md->start_subject;
1315  cb.subject_length = (int)(md->end_subject - md->start_subject);
1316  cb.start_match = (int)(mstart - md->start_subject);
1317  cb.current_position = (int)(eptr - md->start_subject);
1318  cb.pattern_position = GET(ecode, 2);
1319  cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1320  cb.capture_top = offset_top/2;
1321  cb.capture_last = md->capture_last;
1322  cb.callout_data = md->callout_data;
1323  if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1324  if (rrc < 0) RRETURN(rrc);
1325  }
1326  ecode += 2 + 2*LINK_SIZE;
1327  break;
1328 
1329  /* Recursion either matches the current regex, or some subexpression. The
1330  offset data is the offset to the starting bracket from the start of the
1331  whole pattern. (This is so that it works from duplicated subpatterns.)
1332 
1333  If there are any capturing brackets started but not finished, we have to
1334  save their starting points and reinstate them after the recursion. However,
1335  we don't know how many such there are (offset_top records the completed
1336  total) so we just have to save all the potential data. There may be up to
1337  65535 such values, which is too large to put on the stack, but using malloc
1338  for small numbers seems expensive. As a compromise, the stack is used when
1339  there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1340  is used. A problem is what to do if the malloc fails ... there is no way of
1341  returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1342  values on the stack, and accept that the rest may be wrong.
1343 
1344  There are also other values that have to be saved. We use a chained
1345  sequence of blocks that actually live on the stack. Thanks to Robin Houston
1346  for the original version of this logic. */
1347 
1348  case OP_RECURSE:
1349  {
1350  callpat = md->start_code + GET(ecode, 1);
1351  new_recursive.group_num = (callpat == md->start_code)? 0 :
1352  GET2(callpat, 1 + LINK_SIZE);
1353 
1354  /* Add to "recursing stack" */
1355 
1356  new_recursive.prevrec = md->recursive;
1357  md->recursive = &new_recursive;
1358 
1359  /* Find where to continue from afterwards */
1360 
1361  ecode += 1 + LINK_SIZE;
1362  new_recursive.after_call = ecode;
1363 
1364  /* Now save the offset data. */
1365 
1366  new_recursive.saved_max = md->offset_end;
1367  if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1368  new_recursive.offset_save = stacksave;
1369  else
1370  {
1371  new_recursive.offset_save =
1372  (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1373  if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1374  }
1375 
1376  memcpy(new_recursive.offset_save, md->offset_vector,
1377  new_recursive.saved_max * sizeof(int));
1378  new_recursive.save_offset_top = offset_top;
1379 
1380  /* OK, now we can do the recursion. For each top-level alternative we
1381  restore the offset and recursion data. */
1382 
1383  DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1384  flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1385  do
1386  {
1387  RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1388  md, ims, eptrb, flags, RM6);
1389  if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1390  {
1391  DPRINTF(("Recursion matched\n"));
1392  md->recursive = new_recursive.prevrec;
1393  if (new_recursive.offset_save != stacksave)
1394  (pcre_free)(new_recursive.offset_save);
1396  }
1397  else if (rrc != MATCH_NOMATCH &&
1398  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1399  {
1400  DPRINTF(("Recursion gave error %d\n", rrc));
1401  if (new_recursive.offset_save != stacksave)
1402  (pcre_free)(new_recursive.offset_save);
1403  RRETURN(rrc);
1404  }
1405 
1406  md->recursive = &new_recursive;
1407  memcpy(md->offset_vector, new_recursive.offset_save,
1408  new_recursive.saved_max * sizeof(int));
1409  callpat += GET(callpat, 1);
1410  }
1411  while (*callpat == OP_ALT);
1412 
1413  DPRINTF(("Recursion didn't match\n"));
1414  md->recursive = new_recursive.prevrec;
1415  if (new_recursive.offset_save != stacksave)
1416  (pcre_free)(new_recursive.offset_save);
1418  }
1419  /* Control never reaches here */
1420 
1421  /* "Once" brackets are like assertion brackets except that after a match,
1422  the point in the subject string is not moved back. Thus there can never be
1423  a move back into the brackets. Friedl calls these "atomic" subpatterns.
1424  Check the alternative branches in turn - the matching won't pass the KET
1425  for this kind of subpattern. If any one branch matches, we carry on as at
1426  the end of a normal bracket, leaving the subject pointer, but resetting
1427  the start-of-match value in case it was changed by \K. */
1428 
1429  case OP_ONCE:
1430  prev = ecode;
1431  saved_eptr = eptr;
1432 
1433  do
1434  {
1435  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1436  if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1437  {
1438  mstart = md->start_match_ptr;
1439  break;
1440  }
1441  if (rrc != MATCH_NOMATCH &&
1442  (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1443  RRETURN(rrc);
1444  ecode += GET(ecode,1);
1445  }
1446  while (*ecode == OP_ALT);
1447 
1448  /* If hit the end of the group (which could be repeated), fail */
1449 
1450  if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1451 
1452  /* Continue as from after the assertion, updating the offsets high water
1453  mark, since extracts may have been taken. */
1454 
1455  do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1456 
1457  offset_top = md->end_offset_top;
1458  eptr = md->end_match_ptr;
1459 
1460  /* For a non-repeating ket, just continue at this level. This also
1461  happens for a repeating ket if no characters were matched in the group.
1462  This is the forcible breaking of infinite loops as implemented in Perl
1463  5.005. If there is an options reset, it will get obeyed in the normal
1464  course of events. */
1465 
1466  if (*ecode == OP_KET || eptr == saved_eptr)
1467  {
1468  ecode += 1+LINK_SIZE;
1469  break;
1470  }
1471 
1472  /* The repeating kets try the rest of the pattern or restart from the
1473  preceding bracket, in the appropriate order. The second "call" of match()
1474  uses tail recursion, to avoid using another stack frame. We need to reset
1475  any options that changed within the bracket before re-running it, so
1476  check the next opcode. */
1477 
1478  if (ecode[1+LINK_SIZE] == OP_OPT)
1479  {
1480  ims = (ims & ~PCRE_IMS) | ecode[4];
1481  DPRINTF(("ims set to %02lx at group repeat\n", ims));
1482  }
1483 
1484  if (*ecode == OP_KETRMIN)
1485  {
1486  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1487  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1488  ecode = prev;
1489  flags = 0;
1490  goto TAIL_RECURSE;
1491  }
1492  else /* OP_KETRMAX */
1493  {
1494  RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1495  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1496  ecode += 1 + LINK_SIZE;
1497  flags = 0;
1498  goto TAIL_RECURSE;
1499  }
1500  /* Control never gets here */
1501 
1502  /* An alternation is the end of a branch; scan along to find the end of the
1503  bracketed group and go to there. */
1504 
1505  case OP_ALT:
1506  do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1507  break;
1508 
1509  /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1510  indicating that it may occur zero times. It may repeat infinitely, or not
1511  at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1512  with fixed upper repeat limits are compiled as a number of copies, with the
1513  optional ones preceded by BRAZERO or BRAMINZERO. */
1514 
1515  case OP_BRAZERO:
1516  {
1517  next = ecode+1;
1518  RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1519  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1520  do next += GET(next,1); while (*next == OP_ALT);
1521  ecode = next + 1 + LINK_SIZE;
1522  }
1523  break;
1524 
1525  case OP_BRAMINZERO:
1526  {
1527  next = ecode+1;
1528  do next += GET(next, 1); while (*next == OP_ALT);
1529  RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1530  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1531  ecode++;
1532  }
1533  break;
1534 
1535  case OP_SKIPZERO:
1536  {
1537  next = ecode+1;
1538  do next += GET(next,1); while (*next == OP_ALT);
1539  ecode = next + 1 + LINK_SIZE;
1540  }
1541  break;
1542 
1543  /* End of a group, repeated or non-repeating. */
1544 
1545  case OP_KET:
1546  case OP_KETRMIN:
1547  case OP_KETRMAX:
1548  prev = ecode - GET(ecode, 1);
1549 
1550  /* If this was a group that remembered the subject start, in order to break
1551  infinite repeats of empty string matches, retrieve the subject start from
1552  the chain. Otherwise, set it NULL. */
1553 
1554  if (*prev >= OP_SBRA)
1555  {
1556  saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1557  eptrb = eptrb->epb_prev; /* Backup to previous group */
1558  }
1559  else saved_eptr = NULL;
1560 
1561  /* If we are at the end of an assertion group or an atomic group, stop
1562  matching and return MATCH_MATCH, but record the current high water mark for
1563  use by positive assertions. We also need to record the match start in case
1564  it was changed by \K. */
1565 
1566  if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1567  *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1568  *prev == OP_ONCE)
1569  {
1570  md->end_match_ptr = eptr; /* For ONCE */
1571  md->end_offset_top = offset_top;
1572  md->start_match_ptr = mstart;
1574  }
1575 
1576  /* For capturing groups we have to check the group number back at the start
1577  and if necessary complete handling an extraction by setting the offsets and
1578  bumping the high water mark. Note that whole-pattern recursion is coded as
1579  a recurse into group 0, so it won't be picked up here. Instead, we catch it
1580  when the OP_END is reached. Other recursion is handled here. */
1581 
1582  if (*prev == OP_CBRA || *prev == OP_SCBRA)
1583  {
1584  number = GET2(prev, 1+LINK_SIZE);
1585  offset = number << 1;
1586 
1587 #ifdef PCRE_DEBUG
1588  printf("end bracket %d", number);
1589  printf("\n");
1590 #endif
1591 
1592  md->capture_last = number;
1593  if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1594  {
1595  md->offset_vector[offset] =
1596  md->offset_vector[md->offset_end - number];
1597  md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1598  if (offset_top <= offset) offset_top = offset + 2;
1599  }
1600 
1601  /* Handle a recursively called group. Restore the offsets
1602  appropriately and continue from after the call. */
1603 
1604  if (md->recursive != NULL && md->recursive->group_num == number)
1605  {
1606  recursion_info *rec = md->recursive;
1607  DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1608  md->recursive = rec->prevrec;
1609  memcpy(md->offset_vector, rec->offset_save,
1610  rec->saved_max * sizeof(int));
1611  offset_top = rec->save_offset_top;
1612  ecode = rec->after_call;
1613  ims = original_ims;
1614  break;
1615  }
1616  }
1617 
1618  /* For both capturing and non-capturing groups, reset the value of the ims
1619  flags, in case they got changed during the group. */
1620 
1621  ims = original_ims;
1622  DPRINTF(("ims reset to %02lx\n", ims));
1623 
1624  /* For a non-repeating ket, just continue at this level. This also
1625  happens for a repeating ket if no characters were matched in the group.
1626  This is the forcible breaking of infinite loops as implemented in Perl
1627  5.005. If there is an options reset, it will get obeyed in the normal
1628  course of events. */
1629 
1630  if (*ecode == OP_KET || eptr == saved_eptr)
1631  {
1632  ecode += 1 + LINK_SIZE;
1633  break;
1634  }
1635 
1636  /* The repeating kets try the rest of the pattern or restart from the
1637  preceding bracket, in the appropriate order. In the second case, we can use
1638  tail recursion to avoid using another stack frame, unless we have an
1639  unlimited repeat of a group that can match an empty string. */
1640 
1641  flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1642 
1643  if (*ecode == OP_KETRMIN)
1644  {
1645  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1646  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1647  if (flags != 0) /* Could match an empty string */
1648  {
1649  RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1650  RRETURN(rrc);
1651  }
1652  ecode = prev;
1653  goto TAIL_RECURSE;
1654  }
1655  else /* OP_KETRMAX */
1656  {
1657  RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1658  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1659  ecode += 1 + LINK_SIZE;
1660  flags = 0;
1661  goto TAIL_RECURSE;
1662  }
1663  /* Control never gets here */
1664 
1665  /* Start of subject unless notbol, or after internal newline if multiline */
1666 
1667  case OP_CIRC:
1668  if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1669  if ((ims & PCRE_MULTILINE) != 0)
1670  {
1671  if (eptr != md->start_subject &&
1672  (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1674  ecode++;
1675  break;
1676  }
1677  /* ... else fall through */
1678 
1679  /* Start of subject assertion */
1680 
1681  case OP_SOD:
1682  if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1683  ecode++;
1684  break;
1685 
1686  /* Start of match assertion */
1687 
1688  case OP_SOM:
1689  if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1690  ecode++;
1691  break;
1692 
1693  /* Reset the start of match point */
1694 
1695  case OP_SET_SOM:
1696  mstart = eptr;
1697  ecode++;
1698  break;
1699 
1700  /* Assert before internal newline if multiline, or before a terminating
1701  newline unless endonly is set, else end of subject unless noteol is set. */
1702 
1703  case OP_DOLL:
1704  if ((ims & PCRE_MULTILINE) != 0)
1705  {
1706  if (eptr < md->end_subject)
1707  { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1708  else
1709  {
1710  if (md->noteol) MRRETURN(MATCH_NOMATCH);
1711  SCHECK_PARTIAL();
1712  }
1713  ecode++;
1714  break;
1715  }
1716  else /* Not multiline */
1717  {
1718  if (md->noteol) MRRETURN(MATCH_NOMATCH);
1719  if (!md->endonly) goto ASSERT_NL_OR_EOS;
1720  }
1721 
1722  /* ... else fall through for endonly */
1723 
1724  /* End of subject assertion (\z) */
1725 
1726  case OP_EOD:
1727  if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1728  SCHECK_PARTIAL();
1729  ecode++;
1730  break;
1731 
1732  /* End of subject or ending \n assertion (\Z) */
1733 
1734  case OP_EODN:
1735  ASSERT_NL_OR_EOS:
1736  if (eptr < md->end_subject &&
1737  (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1739 
1740  /* Either at end of string or \n before end. */
1741 
1742  SCHECK_PARTIAL();
1743  ecode++;
1744  break;
1745 
1746  /* Word boundary assertions */
1747 
1748  case OP_NOT_WORD_BOUNDARY:
1749  case OP_WORD_BOUNDARY:
1750  {
1751 
1752  /* Find out if the previous and current characters are "word" characters.
1753  It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1754  be "non-word" characters. Remember the earliest consulted character for
1755  partial matching. */
1756 
1757 #ifdef SUPPORT_UTF8
1758  if (utf8)
1759  {
1760  /* Get status of previous character */
1761 
1762  if (eptr == md->start_subject) prev_is_word = FALSE; else
1763  {
1764  USPTR lastptr = eptr - 1;
1765  while((*lastptr & 0xc0) == 0x80) lastptr--;
1766  if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1767  GETCHAR(c, lastptr);
1768 #ifdef SUPPORT_UCP
1769  if (md->use_ucp)
1770  {
1771  if (c == '_') prev_is_word = TRUE; else
1772  {
1773  int cat = UCD_CATEGORY(c);
1774  prev_is_word = (cat == ucp_L || cat == ucp_N);
1775  }
1776  }
1777  else
1778 #endif
1779  prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1780  }
1781 
1782  /* Get status of next character */
1783 
1784  if (eptr >= md->end_subject)
1785  {
1786  SCHECK_PARTIAL();
1787  cur_is_word = FALSE;
1788  }
1789  else
1790  {
1791  GETCHAR(c, eptr);
1792 #ifdef SUPPORT_UCP
1793  if (md->use_ucp)
1794  {
1795  if (c == '_') cur_is_word = TRUE; else
1796  {
1797  int cat = UCD_CATEGORY(c);
1798  cur_is_word = (cat == ucp_L || cat == ucp_N);
1799  }
1800  }
1801  else
1802 #endif
1803  cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1804  }
1805  }
1806  else
1807 #endif
1808 
1809  /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1810  consistency with the behaviour of \w we do use it in this case. */
1811 
1812  {
1813  /* Get status of previous character */
1814 
1815  if (eptr == md->start_subject) prev_is_word = FALSE; else
1816  {
1817  if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1818 #ifdef SUPPORT_UCP
1819  if (md->use_ucp)
1820  {
1821  c = eptr[-1];
1822  if (c == '_') prev_is_word = TRUE; else
1823  {
1824  int cat = UCD_CATEGORY(c);
1825  prev_is_word = (cat == ucp_L || cat == ucp_N);
1826  }
1827  }
1828  else
1829 #endif
1830  prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1831  }
1832 
1833  /* Get status of next character */
1834 
1835  if (eptr >= md->end_subject)
1836  {
1837  SCHECK_PARTIAL();
1838  cur_is_word = FALSE;
1839  }
1840  else
1841 #ifdef SUPPORT_UCP
1842  if (md->use_ucp)
1843  {
1844  c = *eptr;
1845  if (c == '_') cur_is_word = TRUE; else
1846  {
1847  int cat = UCD_CATEGORY(c);
1848  cur_is_word = (cat == ucp_L || cat == ucp_N);
1849  }
1850  }
1851  else
1852 #endif
1853  cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1854  }
1855 
1856  /* Now see if the situation is what we want */
1857 
1858  if ((*ecode++ == OP_WORD_BOUNDARY)?
1859  cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1861  }
1862  break;
1863 
1864  /* Match a single character type; inline for speed */
1865 
1866  case OP_ANY:
1867  if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1868  /* Fall through */
1869 
1870  case OP_ALLANY:
1871  if (eptr++ >= md->end_subject)
1872  {
1873  SCHECK_PARTIAL();
1875  }
1876  if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1877  ecode++;
1878  break;
1879 
1880  /* Match a single byte, even in UTF-8 mode. This opcode really does match
1881  any byte, even newline, independent of the setting of PCRE_DOTALL. */
1882 
1883  case OP_ANYBYTE:
1884  if (eptr++ >= md->end_subject)
1885  {
1886  SCHECK_PARTIAL();
1888  }
1889  ecode++;
1890  break;
1891 
1892  case OP_NOT_DIGIT:
1893  if (eptr >= md->end_subject)
1894  {
1895  SCHECK_PARTIAL();
1897  }
1898  GETCHARINCTEST(c, eptr);
1899  if (
1900 #ifdef SUPPORT_UTF8
1901  c < 256 &&
1902 #endif
1903  (md->ctypes[c] & ctype_digit) != 0
1904  )
1906  ecode++;
1907  break;
1908 
1909  case OP_DIGIT:
1910  if (eptr >= md->end_subject)
1911  {
1912  SCHECK_PARTIAL();
1914  }
1915  GETCHARINCTEST(c, eptr);
1916  if (
1917 #ifdef SUPPORT_UTF8
1918  c >= 256 ||
1919 #endif
1920  (md->ctypes[c] & ctype_digit) == 0
1921  )
1923  ecode++;
1924  break;
1925 
1926  case OP_NOT_WHITESPACE:
1927  if (eptr >= md->end_subject)
1928  {
1929  SCHECK_PARTIAL();
1931  }
1932  GETCHARINCTEST(c, eptr);
1933  if (
1934 #ifdef SUPPORT_UTF8
1935  c < 256 &&
1936 #endif
1937  (md->ctypes[c] & ctype_space) != 0
1938  )
1940  ecode++;
1941  break;
1942 
1943  case OP_WHITESPACE:
1944  if (eptr >= md->end_subject)
1945  {
1946  SCHECK_PARTIAL();
1948  }
1949  GETCHARINCTEST(c, eptr);
1950  if (
1951 #ifdef SUPPORT_UTF8
1952  c >= 256 ||
1953 #endif
1954  (md->ctypes[c] & ctype_space) == 0
1955  )
1957  ecode++;
1958  break;
1959 
1960  case OP_NOT_WORDCHAR:
1961  if (eptr >= md->end_subject)
1962  {
1963  SCHECK_PARTIAL();
1965  }
1966  GETCHARINCTEST(c, eptr);
1967  if (
1968 #ifdef SUPPORT_UTF8
1969  c < 256 &&
1970 #endif
1971  (md->ctypes[c] & ctype_word) != 0
1972  )
1974  ecode++;
1975  break;
1976 
1977  case OP_WORDCHAR:
1978  if (eptr >= md->end_subject)
1979  {
1980  SCHECK_PARTIAL();
1982  }
1983  GETCHARINCTEST(c, eptr);
1984  if (
1985 #ifdef SUPPORT_UTF8
1986  c >= 256 ||
1987 #endif
1988  (md->ctypes[c] & ctype_word) == 0
1989  )
1991  ecode++;
1992  break;
1993 
1994  case OP_ANYNL:
1995  if (eptr >= md->end_subject)
1996  {
1997  SCHECK_PARTIAL();
1999  }
2000  GETCHARINCTEST(c, eptr);
2001  switch(c)
2002  {
2003  default: MRRETURN(MATCH_NOMATCH);
2004  case 0x000d:
2005  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2006  break;
2007 
2008  case 0x000a:
2009  break;
2010 
2011  case 0x000b:
2012  case 0x000c:
2013  case 0x0085:
2014  case 0x2028:
2015  case 0x2029:
2017  break;
2018  }
2019  ecode++;
2020  break;
2021 
2022  case OP_NOT_HSPACE:
2023  if (eptr >= md->end_subject)
2024  {
2025  SCHECK_PARTIAL();
2027  }
2028  GETCHARINCTEST(c, eptr);
2029  switch(c)
2030  {
2031  default: break;
2032  case 0x09: /* HT */
2033  case 0x20: /* SPACE */
2034  case 0xa0: /* NBSP */
2035  case 0x1680: /* OGHAM SPACE MARK */
2036  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2037  case 0x2000: /* EN QUAD */
2038  case 0x2001: /* EM QUAD */
2039  case 0x2002: /* EN SPACE */
2040  case 0x2003: /* EM SPACE */
2041  case 0x2004: /* THREE-PER-EM SPACE */
2042  case 0x2005: /* FOUR-PER-EM SPACE */
2043  case 0x2006: /* SIX-PER-EM SPACE */
2044  case 0x2007: /* FIGURE SPACE */
2045  case 0x2008: /* PUNCTUATION SPACE */
2046  case 0x2009: /* THIN SPACE */
2047  case 0x200A: /* HAIR SPACE */
2048  case 0x202f: /* NARROW NO-BREAK SPACE */
2049  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2050  case 0x3000: /* IDEOGRAPHIC SPACE */
2052  }
2053  ecode++;
2054  break;
2055 
2056  case OP_HSPACE:
2057  if (eptr >= md->end_subject)
2058  {
2059  SCHECK_PARTIAL();
2061  }
2062  GETCHARINCTEST(c, eptr);
2063  switch(c)
2064  {
2065  default: MRRETURN(MATCH_NOMATCH);
2066  case 0x09: /* HT */
2067  case 0x20: /* SPACE */
2068  case 0xa0: /* NBSP */
2069  case 0x1680: /* OGHAM SPACE MARK */
2070  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2071  case 0x2000: /* EN QUAD */
2072  case 0x2001: /* EM QUAD */
2073  case 0x2002: /* EN SPACE */
2074  case 0x2003: /* EM SPACE */
2075  case 0x2004: /* THREE-PER-EM SPACE */
2076  case 0x2005: /* FOUR-PER-EM SPACE */
2077  case 0x2006: /* SIX-PER-EM SPACE */
2078  case 0x2007: /* FIGURE SPACE */
2079  case 0x2008: /* PUNCTUATION SPACE */
2080  case 0x2009: /* THIN SPACE */
2081  case 0x200A: /* HAIR SPACE */
2082  case 0x202f: /* NARROW NO-BREAK SPACE */
2083  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2084  case 0x3000: /* IDEOGRAPHIC SPACE */
2085  break;
2086  }
2087  ecode++;
2088  break;
2089 
2090  case OP_NOT_VSPACE:
2091  if (eptr >= md->end_subject)
2092  {
2093  SCHECK_PARTIAL();
2095  }
2096  GETCHARINCTEST(c, eptr);
2097  switch(c)
2098  {
2099  default: break;
2100  case 0x0a: /* LF */
2101  case 0x0b: /* VT */
2102  case 0x0c: /* FF */
2103  case 0x0d: /* CR */
2104  case 0x85: /* NEL */
2105  case 0x2028: /* LINE SEPARATOR */
2106  case 0x2029: /* PARAGRAPH SEPARATOR */
2108  }
2109  ecode++;
2110  break;
2111 
2112  case OP_VSPACE:
2113  if (eptr >= md->end_subject)
2114  {
2115  SCHECK_PARTIAL();
2117  }
2118  GETCHARINCTEST(c, eptr);
2119  switch(c)
2120  {
2121  default: MRRETURN(MATCH_NOMATCH);
2122  case 0x0a: /* LF */
2123  case 0x0b: /* VT */
2124  case 0x0c: /* FF */
2125  case 0x0d: /* CR */
2126  case 0x85: /* NEL */
2127  case 0x2028: /* LINE SEPARATOR */
2128  case 0x2029: /* PARAGRAPH SEPARATOR */
2129  break;
2130  }
2131  ecode++;
2132  break;
2133 
2134 #ifdef SUPPORT_UCP
2135  /* Check the next character by Unicode property. We will get here only
2136  if the support is in the binary; otherwise a compile-time error occurs. */
2137 
2138  case OP_PROP:
2139  case OP_NOTPROP:
2140  if (eptr >= md->end_subject)
2141  {
2142  SCHECK_PARTIAL();
2144  }
2145  GETCHARINCTEST(c, eptr);
2146  {
2147  const ucd_record *prop = GET_UCD(c);
2148 
2149  switch(ecode[1])
2150  {
2151  case PT_ANY:
2152  if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2153  break;
2154 
2155  case PT_LAMP:
2156  if ((prop->chartype == ucp_Lu ||
2157  prop->chartype == ucp_Ll ||
2158  prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2160  break;
2161 
2162  case PT_GC:
2163  if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2165  break;
2166 
2167  case PT_PC:
2168  if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2170  break;
2171 
2172  case PT_SC:
2173  if ((ecode[2] != prop->script) == (op == OP_PROP))
2175  break;
2176 
2177  /* These are specials */
2178 
2179  case PT_ALNUM:
2180  if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2181  _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2183  break;
2184 
2185  case PT_SPACE: /* Perl space */
2186  if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2187  c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2188  == (op == OP_NOTPROP))
2190  break;
2191 
2192  case PT_PXSPACE: /* POSIX space */
2193  if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2194  c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2195  c == CHAR_FF || c == CHAR_CR)
2196  == (op == OP_NOTPROP))
2198  break;
2199 
2200  case PT_WORD:
2201  if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2202  _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2203  c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2205  break;
2206 
2207  /* This should never occur */
2208 
2209  default:
2211  }
2212 
2213  ecode += 3;
2214  }
2215  break;
2216 
2217  /* Match an extended Unicode sequence. We will get here only if the support
2218  is in the binary; otherwise a compile-time error occurs. */
2219 
2220  case OP_EXTUNI:
2221  if (eptr >= md->end_subject)
2222  {
2223  SCHECK_PARTIAL();
2225  }
2226  GETCHARINCTEST(c, eptr);
2227  {
2228  int category = UCD_CATEGORY(c);
2229  if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2230  while (eptr < md->end_subject)
2231  {
2232  int len = 1;
2233  if (!utf8) c = *eptr; else
2234  {
2235  GETCHARLEN(c, eptr, len);
2236  }
2237  category = UCD_CATEGORY(c);
2238  if (category != ucp_M) break;
2239  eptr += len;
2240  }
2241  }
2242  ecode++;
2243  break;
2244 #endif
2245 
2246 
2247  /* Match a back reference, possibly repeatedly. Look past the end of the
2248  item to see if there is repeat information following. The code is similar
2249  to that for character classes, but repeated for efficiency. Then obey
2250  similar code to character type repeats - written out again for speed.
2251  However, if the referenced string is the empty string, always treat
2252  it as matched, any number of times (otherwise there could be infinite
2253  loops). */
2254 
2255  case OP_REF:
2256  {
2257  offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2258  ecode += 3;
2259 
2260  /* If the reference is unset, there are two possibilities:
2261 
2262  (a) In the default, Perl-compatible state, set the length to be longer
2263  than the amount of subject left; this ensures that every attempt at a
2264  match fails. We can't just fail here, because of the possibility of
2265  quantifiers with zero minima.
2266 
2267  (b) If the JavaScript compatibility flag is set, set the length to zero
2268  so that the back reference matches an empty string.
2269 
2270  Otherwise, set the length to the length of what was matched by the
2271  referenced subpattern. */
2272 
2273  if (offset >= offset_top || md->offset_vector[offset] < 0)
2274  length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
2275  else
2276  length = md->offset_vector[offset+1] - md->offset_vector[offset];
2277 
2278  /* Set up for repetition, or handle the non-repeated case */
2279 
2280  switch (*ecode)
2281  {
2282  case OP_CRSTAR:
2283  case OP_CRMINSTAR:
2284  case OP_CRPLUS:
2285  case OP_CRMINPLUS:
2286  case OP_CRQUERY:
2287  case OP_CRMINQUERY:
2288  c = *ecode++ - OP_CRSTAR;
2289  minimize = (c & 1) != 0;
2290  min = rep_min[c]; /* Pick up values from tables; */
2291  max = rep_max[c]; /* zero for max => infinity */
2292  if (max == 0) max = INT_MAX;
2293  break;
2294 
2295  case OP_CRRANGE:
2296  case OP_CRMINRANGE:
2297  minimize = (*ecode == OP_CRMINRANGE);
2298  min = GET2(ecode, 1);
2299  max = GET2(ecode, 3);
2300  if (max == 0) max = INT_MAX;
2301  ecode += 5;
2302  break;
2303 
2304  default: /* No repeat follows */
2305  if (!match_ref(offset, eptr, length, md, ims))
2306  {
2307  CHECK_PARTIAL();
2309  }
2310  eptr += length;
2311  continue; /* With the main loop */
2312  }
2313 
2314  /* If the length of the reference is zero, just continue with the
2315  main loop. */
2316 
2317  if (length == 0) continue;
2318 
2319  /* First, ensure the minimum number of matches are present. We get back
2320  the length of the reference string explicitly rather than passing the
2321  address of eptr, so that eptr can be a register variable. */
2322 
2323  for (i = 1; i <= min; i++)
2324  {
2325  if (!match_ref(offset, eptr, length, md, ims))
2326  {
2327  CHECK_PARTIAL();
2329  }
2330  eptr += length;
2331  }
2332 
2333  /* If min = max, continue at the same level without recursion.
2334  They are not both allowed to be zero. */
2335 
2336  if (min == max) continue;
2337 
2338  /* If minimizing, keep trying and advancing the pointer */
2339 
2340  if (minimize)
2341  {
2342  for (fi = min;; fi++)
2343  {
2344  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2345  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2346  if (fi >= max) MRRETURN(MATCH_NOMATCH);
2347  if (!match_ref(offset, eptr, length, md, ims))
2348  {
2349  CHECK_PARTIAL();
2351  }
2352  eptr += length;
2353  }
2354  /* Control never gets here */
2355  }
2356 
2357  /* If maximizing, find the longest string and work backwards */
2358 
2359  else
2360  {
2361  pp = eptr;
2362  for (i = min; i < max; i++)
2363  {
2364  if (!match_ref(offset, eptr, length, md, ims))
2365  {
2366  CHECK_PARTIAL();
2367  break;
2368  }
2369  eptr += length;
2370  }
2371  while (eptr >= pp)
2372  {
2373  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2374  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2375  eptr -= length;
2376  }
2378  }
2379  }
2380  /* Control never gets here */
2381 
2382  /* Match a bit-mapped character class, possibly repeatedly. This op code is
2383  used when all the characters in the class have values in the range 0-255,
2384  and either the matching is caseful, or the characters are in the range
2385  0-127 when UTF-8 processing is enabled. The only difference between
2386  OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2387  encountered.
2388 
2389  First, look past the end of the item to see if there is repeat information
2390  following. Then obey similar code to character type repeats - written out
2391  again for speed. */
2392 
2393  case OP_NCLASS:
2394  case OP_CLASS:
2395  {
2396  data = ecode + 1; /* Save for matching */
2397  ecode += 33; /* Advance past the item */
2398 
2399  switch (*ecode)
2400  {
2401  case OP_CRSTAR:
2402  case OP_CRMINSTAR:
2403  case OP_CRPLUS:
2404  case OP_CRMINPLUS:
2405  case OP_CRQUERY:
2406  case OP_CRMINQUERY:
2407  c = *ecode++ - OP_CRSTAR;
2408  minimize = (c & 1) != 0;
2409  min = rep_min[c]; /* Pick up values from tables; */
2410  max = rep_max[c]; /* zero for max => infinity */
2411  if (max == 0) max = INT_MAX;
2412  break;
2413 
2414  case OP_CRRANGE:
2415  case OP_CRMINRANGE:
2416  minimize = (*ecode == OP_CRMINRANGE);
2417  min = GET2(ecode, 1);
2418  max = GET2(ecode, 3);
2419  if (max == 0) max = INT_MAX;
2420  ecode += 5;
2421  break;
2422 
2423  default: /* No repeat follows */
2424  min = max = 1;
2425  break;
2426  }
2427 
2428  /* First, ensure the minimum number of matches are present. */
2429 
2430 #ifdef SUPPORT_UTF8
2431  /* UTF-8 mode */
2432  if (utf8)
2433  {
2434  for (i = 1; i <= min; i++)
2435  {
2436  if (eptr >= md->end_subject)
2437  {
2438  SCHECK_PARTIAL();
2440  }
2441  GETCHARINC(c, eptr);
2442  if (c > 255)
2443  {
2444  if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2445  }
2446  else
2447  {
2448  if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2449  }
2450  }
2451  }
2452  else
2453 #endif
2454  /* Not UTF-8 mode */
2455  {
2456  for (i = 1; i <= min; i++)
2457  {
2458  if (eptr >= md->end_subject)
2459  {
2460  SCHECK_PARTIAL();
2462  }
2463  c = *eptr++;
2464  if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2465  }
2466  }
2467 
2468  /* If max == min we can continue with the main loop without the
2469  need to recurse. */
2470 
2471  if (min == max) continue;
2472 
2473  /* If minimizing, keep testing the rest of the expression and advancing
2474  the pointer while it matches the class. */
2475 
2476  if (minimize)
2477  {
2478 #ifdef SUPPORT_UTF8
2479  /* UTF-8 mode */
2480  if (utf8)
2481  {
2482  for (fi = min;; fi++)
2483  {
2484  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2485  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2486  if (fi >= max) MRRETURN(MATCH_NOMATCH);
2487  if (eptr >= md->end_subject)
2488  {
2489  SCHECK_PARTIAL();
2491  }
2492  GETCHARINC(c, eptr);
2493  if (c > 255)
2494  {
2495  if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2496  }
2497  else
2498  {
2499  if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2500  }
2501  }
2502  }
2503  else
2504 #endif
2505  /* Not UTF-8 mode */
2506  {
2507  for (fi = min;; fi++)
2508  {
2509  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2510  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511  if (fi >= max) MRRETURN(MATCH_NOMATCH);
2512  if (eptr >= md->end_subject)
2513  {
2514  SCHECK_PARTIAL();
2516  }
2517  c = *eptr++;
2518  if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2519  }
2520  }
2521  /* Control never gets here */
2522  }
2523 
2524  /* If maximizing, find the longest possible run, then work backwards. */
2525 
2526  else
2527  {
2528  pp = eptr;
2529 
2530 #ifdef SUPPORT_UTF8
2531  /* UTF-8 mode */
2532  if (utf8)
2533  {
2534  for (i = min; i < max; i++)
2535  {
2536  int len = 1;
2537  if (eptr >= md->end_subject)
2538  {
2539  SCHECK_PARTIAL();
2540  break;
2541  }
2542  GETCHARLEN(c, eptr, len);
2543  if (c > 255)
2544  {
2545  if (op == OP_CLASS) break;
2546  }
2547  else
2548  {
2549  if ((data[c/8] & (1 << (c&7))) == 0) break;
2550  }
2551  eptr += len;
2552  }
2553  for (;;)
2554  {
2555  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2556  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2557  if (eptr-- == pp) break; /* Stop if tried at original pos */
2558  BACKCHAR(eptr);
2559  }
2560  }
2561  else
2562 #endif
2563  /* Not UTF-8 mode */
2564  {
2565  for (i = min; i < max; i++)
2566  {
2567  if (eptr >= md->end_subject)
2568  {
2569  SCHECK_PARTIAL();
2570  break;
2571  }
2572  c = *eptr;
2573  if ((data[c/8] & (1 << (c&7))) == 0) break;
2574  eptr++;
2575  }
2576  while (eptr >= pp)
2577  {
2578  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2579  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2580  eptr--;
2581  }
2582  }
2583 
2585  }
2586  }
2587  /* Control never gets here */
2588 
2589 
2590  /* Match an extended character class. This opcode is encountered only
2591  when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2592  mode, because Unicode properties are supported in non-UTF-8 mode. */
2593 
2594 #ifdef SUPPORT_UTF8
2595  case OP_XCLASS:
2596  {
2597  data = ecode + 1 + LINK_SIZE; /* Save for matching */
2598  ecode += GET(ecode, 1); /* Advance past the item */
2599 
2600  switch (*ecode)
2601  {
2602  case OP_CRSTAR:
2603  case OP_CRMINSTAR:
2604  case OP_CRPLUS:
2605  case OP_CRMINPLUS:
2606  case OP_CRQUERY:
2607  case OP_CRMINQUERY:
2608  c = *ecode++ - OP_CRSTAR;
2609  minimize = (c & 1) != 0;
2610  min = rep_min[c]; /* Pick up values from tables; */
2611  max = rep_max[c]; /* zero for max => infinity */
2612  if (max == 0) max = INT_MAX;
2613  break;
2614 
2615  case OP_CRRANGE:
2616  case OP_CRMINRANGE:
2617  minimize = (*ecode == OP_CRMINRANGE);
2618  min = GET2(ecode, 1);
2619  max = GET2(ecode, 3);
2620  if (max == 0) max = INT_MAX;
2621  ecode += 5;
2622  break;
2623 
2624  default: /* No repeat follows */
2625  min = max = 1;
2626  break;
2627  }
2628 
2629  /* First, ensure the minimum number of matches are present. */
2630 
2631  for (i = 1; i <= min; i++)
2632  {
2633  if (eptr >= md->end_subject)
2634  {
2635  SCHECK_PARTIAL();
2637  }
2638  GETCHARINCTEST(c, eptr);
2639  if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2640  }
2641 
2642  /* If max == min we can continue with the main loop without the
2643  need to recurse. */
2644 
2645  if (min == max) continue;
2646 
2647  /* If minimizing, keep testing the rest of the expression and advancing
2648  the pointer while it matches the class. */
2649 
2650  if (minimize)
2651  {
2652  for (fi = min;; fi++)
2653  {
2654  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2655  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656  if (fi >= max) MRRETURN(MATCH_NOMATCH);
2657  if (eptr >= md->end_subject)
2658  {
2659  SCHECK_PARTIAL();
2661  }
2662  GETCHARINCTEST(c, eptr);
2663  if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2664  }
2665  /* Control never gets here */
2666  }
2667 
2668  /* If maximizing, find the longest possible run, then work backwards. */
2669 
2670  else
2671  {
2672  pp = eptr;
2673  for (i = min; i < max; i++)
2674  {
2675  int len = 1;
2676  if (eptr >= md->end_subject)
2677  {
2678  SCHECK_PARTIAL();
2679  break;
2680  }
2681  GETCHARLENTEST(c, eptr, len);
2682  if (!_pcre_xclass(c, data)) break;
2683  eptr += len;
2684  }
2685  for(;;)
2686  {
2687  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2688  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689  if (eptr-- == pp) break; /* Stop if tried at original pos */
2690  if (utf8) BACKCHAR(eptr);
2691  }
2693  }
2694 
2695  /* Control never gets here */
2696  }
2697 #endif /* End of XCLASS */
2698 
2699  /* Match a single character, casefully */
2700 
2701  case OP_CHAR:
2702 #ifdef SUPPORT_UTF8
2703  if (utf8)
2704  {
2705  length = 1;
2706  ecode++;
2707  GETCHARLEN(fc, ecode, length);
2708  if (length > md->end_subject - eptr)
2709  {
2710  CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2712  }
2713  while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2714  }
2715  else
2716 #endif
2717 
2718  /* Non-UTF-8 mode */
2719  {
2720  if (md->end_subject - eptr < 1)
2721  {
2722  SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2724  }
2725  if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2726  ecode += 2;
2727  }
2728  break;
2729 
2730  /* Match a single character, caselessly */
2731 
2732  case OP_CHARNC:
2733 #ifdef SUPPORT_UTF8
2734  if (utf8)
2735  {
2736  length = 1;
2737  ecode++;
2738  GETCHARLEN(fc, ecode, length);
2739 
2740  if (length > md->end_subject - eptr)
2741  {
2742  CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2744  }
2745 
2746  /* If the pattern character's value is < 128, we have only one byte, and
2747  can use the fast lookup table. */
2748 
2749  if (fc < 128)
2750  {
2751  if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2752  }
2753 
2754  /* Otherwise we must pick up the subject character */
2755 
2756  else
2757  {
2758  unsigned int dc;
2759  GETCHARINC(dc, eptr);
2760  ecode += length;
2761 
2762  /* If we have Unicode property support, we can use it to test the other
2763  case of the character, if there is one. */
2764 
2765  if (fc != dc)
2766  {
2767 #ifdef SUPPORT_UCP
2768  if (dc != UCD_OTHERCASE(fc))
2769 #endif
2771  }
2772  }
2773  }
2774  else
2775 #endif /* SUPPORT_UTF8 */
2776 
2777  /* Non-UTF-8 mode */
2778  {
2779  if (md->end_subject - eptr < 1)
2780  {
2781  SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2783  }
2784  if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2785  ecode += 2;
2786  }
2787  break;
2788 
2789  /* Match a single character repeatedly. */
2790 
2791  case OP_EXACT:
2792  min = max = GET2(ecode, 1);
2793  ecode += 3;
2794  goto REPEATCHAR;
2795 
2796  case OP_POSUPTO:
2797  possessive = TRUE;
2798  /* Fall through */
2799 
2800  case OP_UPTO:
2801  case OP_MINUPTO:
2802  min = 0;
2803  max = GET2(ecode, 1);
2804  minimize = *ecode == OP_MINUPTO;
2805  ecode += 3;
2806  goto REPEATCHAR;
2807 
2808  case OP_POSSTAR:
2809  possessive = TRUE;
2810  min = 0;
2811  max = INT_MAX;
2812  ecode++;
2813  goto REPEATCHAR;
2814 
2815  case OP_POSPLUS:
2816  possessive = TRUE;
2817  min = 1;
2818  max = INT_MAX;
2819  ecode++;
2820  goto REPEATCHAR;
2821 
2822  case OP_POSQUERY:
2823  possessive = TRUE;
2824  min = 0;
2825  max = 1;
2826  ecode++;
2827  goto REPEATCHAR;
2828 
2829  case OP_STAR:
2830  case OP_MINSTAR:
2831  case OP_PLUS:
2832  case OP_MINPLUS:
2833  case OP_QUERY:
2834  case OP_MINQUERY:
2835  c = *ecode++ - OP_STAR;
2836  minimize = (c & 1) != 0;
2837 
2838  min = rep_min[c]; /* Pick up values from tables; */
2839  max = rep_max[c]; /* zero for max => infinity */
2840  if (max == 0) max = INT_MAX;
2841 
2842  /* Common code for all repeated single-character matches. */
2843 
2844  REPEATCHAR:
2845 #ifdef SUPPORT_UTF8
2846  if (utf8)
2847  {
2848  length = 1;
2849  charptr = ecode;
2850  GETCHARLEN(fc, ecode, length);
2851  ecode += length;
2852 
2853  /* Handle multibyte character matching specially here. There is
2854  support for caseless matching if UCP support is present. */
2855 
2856  if (length > 1)
2857  {
2858 #ifdef SUPPORT_UCP
2859  unsigned int othercase;
2860  if ((ims & PCRE_CASELESS) != 0 &&
2861  (othercase = UCD_OTHERCASE(fc)) != fc)
2862  oclength = _pcre_ord2utf8(othercase, occhars);
2863  else oclength = 0;
2864 #endif /* SUPPORT_UCP */
2865 
2866  for (i = 1; i <= min; i++)
2867  {
2868  if (eptr <= md->end_subject - length &&
2869  memcmp(eptr, charptr, length) == 0) eptr += length;
2870 #ifdef SUPPORT_UCP
2871  else if (oclength > 0 &&
2872  eptr <= md->end_subject - oclength &&
2873  memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2874 #endif /* SUPPORT_UCP */
2875  else
2876  {
2877  CHECK_PARTIAL();
2879  }
2880  }
2881 
2882  if (min == max) continue;
2883 
2884  if (minimize)
2885  {
2886  for (fi = min;; fi++)
2887  {
2888  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2889  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2890  if (fi >= max) MRRETURN(MATCH_NOMATCH);
2891  if (eptr <= md->end_subject - length &&
2892  memcmp(eptr, charptr, length) == 0) eptr += length;
2893 #ifdef SUPPORT_UCP
2894  else if (oclength > 0 &&
2895  eptr <= md->end_subject - oclength &&
2896  memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2897 #endif /* SUPPORT_UCP */
2898  else
2899  {
2900  CHECK_PARTIAL();
2902  }
2903  }
2904  /* Control never gets here */
2905  }
2906 
2907  else /* Maximize */
2908  {
2909  pp = eptr;
2910  for (i = min; i < max; i++)
2911  {
2912  if (eptr <= md->end_subject - length &&
2913  memcmp(eptr, charptr, length) == 0) eptr += length;
2914 #ifdef SUPPORT_UCP
2915  else if (oclength > 0 &&
2916  eptr <= md->end_subject - oclength &&
2917  memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2918 #endif /* SUPPORT_UCP */
2919  else
2920  {
2921  CHECK_PARTIAL();
2922  break;
2923  }
2924  }
2925 
2926  if (possessive) continue;
2927 
2928  for(;;)
2929  {
2930  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2931  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2932  if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2933 #ifdef SUPPORT_UCP
2934  eptr--;
2935  BACKCHAR(eptr);
2936 #else /* without SUPPORT_UCP */
2937  eptr -= length;
2938 #endif /* SUPPORT_UCP */
2939  }
2940  }
2941  /* Control never gets here */
2942  }
2943 
2944  /* If the length of a UTF-8 character is 1, we fall through here, and
2945  obey the code as for non-UTF-8 characters below, though in this case the
2946  value of fc will always be < 128. */
2947  }
2948  else
2949 #endif /* SUPPORT_UTF8 */
2950 
2951  /* When not in UTF-8 mode, load a single-byte character. */
2952 
2953  fc = *ecode++;
2954 
2955  /* The value of fc at this point is always less than 256, though we may or
2956  may not be in UTF-8 mode. The code is duplicated for the caseless and
2957  caseful cases, for speed, since matching characters is likely to be quite
2958  common. First, ensure the minimum number of matches are present. If min =
2959  max, continue at the same level without recursing. Otherwise, if
2960  minimizing, keep trying the rest of the expression and advancing one
2961  matching character if failing, up to the maximum. Alternatively, if
2962  maximizing, find the maximum number of characters and work backwards. */
2963 
2964  DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2965  max, eptr));
2966 
2967  if ((ims & PCRE_CASELESS) != 0)
2968  {
2969  fc = md->lcc[fc];
2970  for (i = 1; i <= min; i++)
2971  {
2972  if (eptr >= md->end_subject)
2973  {
2974  SCHECK_PARTIAL();
2976  }
2977  if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2978  }
2979  if (min == max) continue;
2980  if (minimize)
2981  {
2982  for (fi = min;; fi++)
2983  {
2984  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2985  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2986  if (fi >= max) MRRETURN(MATCH_NOMATCH);
2987  if (eptr >= md->end_subject)
2988  {
2989  SCHECK_PARTIAL();
2991  }
2992  if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2993  }
2994  /* Control never gets here */
2995  }
2996  else /* Maximize */
2997  {
2998  pp = eptr;
2999  for (i = min; i < max; i++)
3000  {
3001  if (eptr >= md->end_subject)
3002  {
3003  SCHECK_PARTIAL();
3004  break;
3005  }
3006  if (fc != md->lcc[*eptr]) break;
3007  eptr++;
3008  }
3009 
3010  if (possessive) continue;
3011 
3012  while (eptr >= pp)
3013  {
3014  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
3015  eptr--;
3016  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3017  }
3019  }
3020  /* Control never gets here */
3021  }
3022 
3023  /* Caseful comparisons (includes all multi-byte characters) */
3024 
3025  else
3026  {
3027  for (i = 1; i <= min; i++)
3028  {
3029  if (eptr >= md->end_subject)
3030  {
3031  SCHECK_PARTIAL();
3033  }
3034  if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3035  }
3036 
3037  if (min == max) continue;
3038 
3039  if (minimize)
3040  {
3041  for (fi = min;; fi++)
3042  {
3043  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3044  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045  if (fi >= max) MRRETURN(MATCH_NOMATCH);
3046  if (eptr >= md->end_subject)
3047  {
3048  SCHECK_PARTIAL();
3050  }
3051  if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3052  }
3053  /* Control never gets here */
3054  }
3055  else /* Maximize */
3056  {
3057  pp = eptr;
3058  for (i = min; i < max; i++)
3059  {
3060  if (eptr >= md->end_subject)
3061  {
3062  SCHECK_PARTIAL();
3063  break;
3064  }
3065  if (fc != *eptr) break;
3066  eptr++;
3067  }
3068  if (possessive) continue;
3069 
3070  while (eptr >= pp)
3071  {
3072  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3073  eptr--;
3074  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3075  }
3077  }
3078  }
3079  /* Control never gets here */
3080 
3081  /* Match a negated single one-byte character. The character we are
3082  checking can be multibyte. */
3083 
3084  case OP_NOT:
3085  if (eptr >= md->end_subject)
3086  {
3087  SCHECK_PARTIAL();
3089  }
3090  ecode++;
3091  GETCHARINCTEST(c, eptr);
3092  if ((ims & PCRE_CASELESS) != 0)
3093  {
3094 #ifdef SUPPORT_UTF8
3095  if (c < 256)
3096 #endif
3097  c = md->lcc[c];
3098  if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3099  }
3100  else
3101  {
3102  if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3103  }
3104  break;
3105 
3106  /* Match a negated single one-byte character repeatedly. This is almost a
3107  repeat of the code for a repeated single character, but I haven't found a
3108  nice way of commoning these up that doesn't require a test of the
3109  positive/negative option for each character match. Maybe that wouldn't add
3110  very much to the time taken, but character matching *is* what this is all
3111  about... */
3112 
3113  case OP_NOTEXACT:
3114  min = max = GET2(ecode, 1);
3115  ecode += 3;
3116  goto REPEATNOTCHAR;
3117 
3118  case OP_NOTUPTO:
3119  case OP_NOTMINUPTO:
3120  min = 0;
3121  max = GET2(ecode, 1);
3122  minimize = *ecode == OP_NOTMINUPTO;
3123  ecode += 3;
3124  goto REPEATNOTCHAR;
3125 
3126  case OP_NOTPOSSTAR:
3127  possessive = TRUE;
3128  min = 0;
3129  max = INT_MAX;
3130  ecode++;
3131  goto REPEATNOTCHAR;
3132 
3133  case OP_NOTPOSPLUS:
3134  possessive = TRUE;
3135  min = 1;
3136  max = INT_MAX;
3137  ecode++;
3138  goto REPEATNOTCHAR;
3139 
3140  case OP_NOTPOSQUERY:
3141  possessive = TRUE;
3142  min = 0;
3143  max = 1;
3144  ecode++;
3145  goto REPEATNOTCHAR;
3146 
3147  case OP_NOTPOSUPTO:
3148  possessive = TRUE;
3149  min = 0;
3150  max = GET2(ecode, 1);
3151  ecode += 3;
3152  goto REPEATNOTCHAR;
3153 
3154  case OP_NOTSTAR:
3155  case OP_NOTMINSTAR:
3156  case OP_NOTPLUS:
3157  case OP_NOTMINPLUS:
3158  case OP_NOTQUERY:
3159  case OP_NOTMINQUERY:
3160  c = *ecode++ - OP_NOTSTAR;
3161  minimize = (c & 1) != 0;
3162  min = rep_min[c]; /* Pick up values from tables; */
3163  max = rep_max[c]; /* zero for max => infinity */
3164  if (max == 0) max = INT_MAX;
3165 
3166  /* Common code for all repeated single-byte matches. */
3167 
3168  REPEATNOTCHAR:
3169  fc = *ecode++;
3170 
3171  /* The code is duplicated for the caseless and caseful cases, for speed,
3172  since matching characters is likely to be quite common. First, ensure the
3173  minimum number of matches are present. If min = max, continue at the same
3174  level without recursing. Otherwise, if minimizing, keep trying the rest of
3175  the expression and advancing one matching character if failing, up to the
3176  maximum. Alternatively, if maximizing, find the maximum number of
3177  characters and work backwards. */
3178 
3179  DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3180  max, eptr));
3181 
3182  if ((ims & PCRE_CASELESS) != 0)
3183  {
3184  fc = md->lcc[fc];
3185 
3186 #ifdef SUPPORT_UTF8
3187  /* UTF-8 mode */
3188  if (utf8)
3189  {
3190  register unsigned int d;
3191  for (i = 1; i <= min; i++)
3192  {
3193  if (eptr >= md->end_subject)
3194  {
3195  SCHECK_PARTIAL();
3197  }
3198  GETCHARINC(d, eptr);
3199  if (d < 256) d = md->lcc[d];
3200  if (fc == d) MRRETURN(MATCH_NOMATCH);
3201  }
3202  }
3203  else
3204 #endif
3205 
3206  /* Not UTF-8 mode */
3207  {
3208  for (i = 1; i <= min; i++)
3209  {
3210  if (eptr >= md->end_subject)
3211  {
3212  SCHECK_PARTIAL();
3214  }
3215  if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3216  }
3217  }
3218 
3219  if (min == max) continue;
3220 
3221  if (minimize)
3222  {
3223 #ifdef SUPPORT_UTF8
3224  /* UTF-8 mode */
3225  if (utf8)
3226  {
3227  register unsigned int d;
3228  for (fi = min;; fi++)
3229  {
3230  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3231  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3232  if (fi >= max) MRRETURN(MATCH_NOMATCH);
3233  if (eptr >= md->end_subject)
3234  {
3235  SCHECK_PARTIAL();
3237  }
3238  GETCHARINC(d, eptr);
3239  if (d < 256) d = md->lcc[d];
3240  if (fc == d) MRRETURN(MATCH_NOMATCH);
3241  }
3242  }
3243  else
3244 #endif
3245  /* Not UTF-8 mode */
3246  {
3247  for (fi = min;; fi++)
3248  {
3249  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3250  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3251  if (fi >= max) MRRETURN(MATCH_NOMATCH);
3252  if (eptr >= md->end_subject)
3253  {
3254  SCHECK_PARTIAL();
3256  }
3257  if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3258  }
3259  }
3260  /* Control never gets here */
3261  }
3262 
3263  /* Maximize case */
3264 
3265  else
3266  {
3267  pp = eptr;
3268 
3269 #ifdef SUPPORT_UTF8
3270  /* UTF-8 mode */
3271  if (utf8)
3272  {
3273  register unsigned int d;
3274  for (i = min; i < max; i++)
3275  {
3276  int len = 1;
3277  if (eptr >= md->end_subject)
3278  {
3279  SCHECK_PARTIAL();
3280  break;
3281  }
3282  GETCHARLEN(d, eptr, len);
3283  if (d < 256) d = md->lcc[d];
3284  if (fc == d) break;
3285  eptr += len;
3286  }
3287  if (possessive) continue;
3288  for(;;)
3289  {
3290  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3291  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3292  if (eptr-- == pp) break; /* Stop if tried at original pos */
3293  BACKCHAR(eptr);
3294  }
3295  }
3296  else
3297 #endif
3298  /* Not UTF-8 mode */
3299  {
3300  for (i = min; i < max; i++)
3301  {
3302  if (eptr >= md->end_subject)
3303  {
3304  SCHECK_PARTIAL();
3305  break;
3306  }
3307  if (fc == md->lcc[*eptr]) break;
3308  eptr++;
3309  }
3310  if (possessive) continue;
3311  while (eptr >= pp)
3312  {
3313  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3314  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3315  eptr--;
3316  }
3317  }
3318 
3320  }
3321  /* Control never gets here */
3322  }
3323 
3324  /* Caseful comparisons */
3325 
3326  else
3327  {
3328 #ifdef SUPPORT_UTF8
3329  /* UTF-8 mode */
3330  if (utf8)
3331  {
3332  register unsigned int d;
3333  for (i = 1; i <= min; i++)
3334  {
3335  if (eptr >= md->end_subject)
3336  {
3337  SCHECK_PARTIAL();
3339  }
3340  GETCHARINC(d, eptr);
3341  if (fc == d) MRRETURN(MATCH_NOMATCH);
3342  }
3343  }
3344  else
3345 #endif
3346  /* Not UTF-8 mode */
3347  {
3348  for (i = 1; i <= min; i++)
3349  {
3350  if (eptr >= md->end_subject)
3351  {
3352  SCHECK_PARTIAL();
3354  }
3355  if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3356  }
3357  }
3358 
3359  if (min == max) continue;
3360 
3361  if (minimize)
3362  {
3363 #ifdef SUPPORT_UTF8
3364  /* UTF-8 mode */
3365  if (utf8)
3366  {
3367  register unsigned int d;
3368  for (fi = min;; fi++)
3369  {
3370  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3371  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3372  if (fi >= max) MRRETURN(MATCH_NOMATCH);
3373  if (eptr >= md->end_subject)
3374  {
3375  SCHECK_PARTIAL();
3377  }
3378  GETCHARINC(d, eptr);
3379  if (fc == d) MRRETURN(MATCH_NOMATCH);
3380  }
3381  }
3382  else
3383 #endif
3384  /* Not UTF-8 mode */
3385  {
3386  for (fi = min;; fi++)
3387  {
3388  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3389  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3390  if (fi >= max) MRRETURN(MATCH_NOMATCH);
3391  if (eptr >= md->end_subject)
3392  {
3393  SCHECK_PARTIAL();
3395  }
3396  if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3397  }
3398  }
3399  /* Control never gets here */
3400  }
3401 
3402  /* Maximize case */
3403 
3404  else
3405  {
3406  pp = eptr;
3407 
3408 #ifdef SUPPORT_UTF8
3409  /* UTF-8 mode */
3410  if (utf8)
3411  {
3412  register unsigned int d;
3413  for (i = min; i < max; i++)
3414  {
3415  int len = 1;
3416  if (eptr >= md->end_subject)
3417  {
3418  SCHECK_PARTIAL();
3419  break;
3420  }
3421  GETCHARLEN(d, eptr, len);
3422  if (fc == d) break;
3423  eptr += len;
3424  }
3425  if (possessive) continue;
3426  for(;;)
3427  {
3428  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3429  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3430  if (eptr-- == pp) break; /* Stop if tried at original pos */
3431  BACKCHAR(eptr);
3432  }
3433  }
3434  else
3435 #endif
3436  /* Not UTF-8 mode */
3437  {
3438  for (i = min; i < max; i++)
3439  {
3440  if (eptr >= md->end_subject)
3441  {
3442  SCHECK_PARTIAL();
3443  break;
3444  }
3445  if (fc == *eptr) break;
3446  eptr++;
3447  }
3448  if (possessive) continue;
3449  while (eptr >= pp)
3450  {
3451  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3452  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453  eptr--;
3454  }
3455  }
3456 
3458  }
3459  }
3460  /* Control never gets here */
3461 
3462  /* Match a single character type repeatedly; several different opcodes
3463  share code. This is very similar to the code for single characters, but we
3464  repeat it in the interests of efficiency. */
3465 
3466  case OP_TYPEEXACT:
3467  min = max = GET2(ecode, 1);
3468  minimize = TRUE;
3469  ecode += 3;
3470  goto REPEATTYPE;
3471 
3472  case OP_TYPEUPTO:
3473  case OP_TYPEMINUPTO:
3474  min = 0;
3475  max = GET2(ecode, 1);
3476  minimize = *ecode == OP_TYPEMINUPTO;
3477  ecode += 3;
3478  goto REPEATTYPE;
3479 
3480  case OP_TYPEPOSSTAR:
3481  possessive = TRUE;
3482  min = 0;
3483  max = INT_MAX;
3484  ecode++;
3485  goto REPEATTYPE;
3486 
3487  case OP_TYPEPOSPLUS:
3488  possessive = TRUE;
3489  min = 1;
3490  max = INT_MAX;
3491  ecode++;
3492  goto REPEATTYPE;
3493 
3494  case OP_TYPEPOSQUERY:
3495  possessive = TRUE;
3496  min = 0;
3497  max = 1;
3498  ecode++;
3499  goto REPEATTYPE;
3500 
3501  case OP_TYPEPOSUPTO:
3502  possessive = TRUE;
3503  min = 0;
3504  max = GET2(ecode, 1);
3505  ecode += 3;
3506  goto REPEATTYPE;
3507 
3508  case OP_TYPESTAR:
3509  case OP_TYPEMINSTAR:
3510  case OP_TYPEPLUS:
3511  case OP_TYPEMINPLUS:
3512  case OP_TYPEQUERY:
3513  case OP_TYPEMINQUERY:
3514  c = *ecode++ - OP_TYPESTAR;
3515  minimize = (c & 1) != 0;
3516  min = rep_min[c]; /* Pick up values from tables; */
3517  max = rep_max[c]; /* zero for max => infinity */
3518  if (max == 0) max = INT_MAX;
3519 
3520  /* Common code for all repeated single character type matches. Note that
3521  in UTF-8 mode, '.' matches a character of any length, but for the other
3522  character types, the valid characters are all one byte long. */
3523 
3524  REPEATTYPE:
3525  ctype = *ecode++; /* Code for the character type */
3526 
3527 #ifdef SUPPORT_UCP
3528  if (ctype == OP_PROP || ctype == OP_NOTPROP)
3529  {
3530  prop_fail_result = ctype == OP_NOTPROP;
3531  prop_type = *ecode++;
3532  prop_value = *ecode++;
3533  }
3534  else prop_type = -1;
3535 #endif
3536 
3537  /* First, ensure the minimum number of matches is present. Use inline
3538  code for maximizing the speed, and do the type test once at the start
3539  (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3540  is tidier. Also separate the UCP code, which can be the same for both UTF-8
3541  and single-bytes. */
3542 
3543  if (min > 0)
3544  {
3545 #ifdef SUPPORT_UCP
3546  if (prop_type >= 0)
3547  {
3548  switch(prop_type)
3549  {
3550  case PT_ANY:
3551  if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3552  for (i = 1; i <= min; i++)
3553  {
3554  if (eptr >= md->end_subject)
3555  {
3556  SCHECK_PARTIAL();
3558  }
3559  GETCHARINCTEST(c, eptr);
3560  }
3561  break;
3562 
3563  case PT_LAMP:
3564  for (i = 1; i <= min; i++)
3565  {
3566  if (eptr >= md->end_subject)
3567  {
3568  SCHECK_PARTIAL();
3570  }
3571  GETCHARINCTEST(c, eptr);
3572  prop_chartype = UCD_CHARTYPE(c);
3573  if ((prop_chartype == ucp_Lu ||
3574  prop_chartype == ucp_Ll ||
3575  prop_chartype == ucp_Lt) == prop_fail_result)
3577  }
3578  break;
3579 
3580  case PT_GC:
3581  for (i = 1; i <= min; i++)
3582  {
3583  if (eptr >= md->end_subject)
3584  {
3585  SCHECK_PARTIAL();
3587  }
3588  GETCHARINCTEST(c, eptr);
3589  prop_category = UCD_CATEGORY(c);
3590  if ((prop_category == prop_value) == prop_fail_result)
3592  }
3593  break;
3594 
3595  case PT_PC:
3596  for (i = 1; i <= min; i++)
3597  {
3598  if (eptr >= md->end_subject)
3599  {
3600  SCHECK_PARTIAL();
3602  }
3603  GETCHARINCTEST(c, eptr);
3604  prop_chartype = UCD_CHARTYPE(c);
3605  if ((prop_chartype == prop_value) == prop_fail_result)
3607  }
3608  break;
3609 
3610  case PT_SC:
3611  for (i = 1; i <= min; i++)
3612  {
3613  if (eptr >= md->end_subject)
3614  {
3615  SCHECK_PARTIAL();
3617  }
3618  GETCHARINCTEST(c, eptr);
3619  prop_script = UCD_SCRIPT(c);
3620  if ((prop_script == prop_value) == prop_fail_result)
3622  }
3623  break;
3624 
3625  case PT_ALNUM:
3626  for (i = 1; i <= min; i++)
3627  {
3628  if (eptr >= md->end_subject)
3629  {
3630  SCHECK_PARTIAL();
3632  }
3633  GETCHARINCTEST(c, eptr);
3634  prop_category = UCD_CATEGORY(c);
3635  if ((prop_category == ucp_L || prop_category == ucp_N)
3636  == prop_fail_result)
3638  }
3639  break;
3640 
3641  case PT_SPACE: /* Perl space */
3642  for (i = 1; i <= min; i++)
3643  {
3644  if (eptr >= md->end_subject)
3645  {
3646  SCHECK_PARTIAL();
3648  }
3649  GETCHARINCTEST(c, eptr);
3650  prop_category = UCD_CATEGORY(c);
3651  if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3652  c == CHAR_FF || c == CHAR_CR)
3653  == prop_fail_result)
3655  }
3656  break;
3657 
3658  case PT_PXSPACE: /* POSIX space */
3659  for (i = 1; i <= min; i++)
3660  {
3661  if (eptr >= md->end_subject)
3662  {
3663  SCHECK_PARTIAL();
3665  }
3666  GETCHARINCTEST(c, eptr);
3667  prop_category = UCD_CATEGORY(c);
3668  if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3669  c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3670  == prop_fail_result)
3672  }
3673  break;
3674 
3675  case PT_WORD:
3676  for (i = 1; i <= min; i++)
3677  {
3678  if (eptr >= md->end_subject)
3679  {
3680  SCHECK_PARTIAL();
3682  }
3683  GETCHARINCTEST(c, eptr);
3684  prop_category = UCD_CATEGORY(c);
3685  if ((prop_category == ucp_L || prop_category == ucp_N ||
3686  c == CHAR_UNDERSCORE)
3687  == prop_fail_result)
3689  }
3690  break;
3691 
3692  /* This should not occur */
3693 
3694  default:
3696  }
3697  }
3698 
3699  /* Match extended Unicode sequences. We will get here only if the
3700  support is in the binary; otherwise a compile-time error occurs. */
3701 
3702  else if (ctype == OP_EXTUNI)
3703  {
3704  for (i = 1; i <= min; i++)
3705  {
3706  if (eptr >= md->end_subject)
3707  {
3708  SCHECK_PARTIAL();
3710  }
3711  GETCHARINCTEST(c, eptr);
3712  prop_category = UCD_CATEGORY(c);
3713  if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3714  while (eptr < md->end_subject)
3715  {
3716  int len = 1;
3717  if (!utf8) c = *eptr;
3718  else { GETCHARLEN(c, eptr, len); }
3719  prop_category = UCD_CATEGORY(c);
3720  if (prop_category != ucp_M) break;
3721  eptr += len;
3722  }
3723  }
3724  }
3725 
3726  else
3727 #endif /* SUPPORT_UCP */
3728 
3729 /* Handle all other cases when the coding is UTF-8 */
3730 
3731 #ifdef SUPPORT_UTF8
3732  if (utf8) switch(ctype)
3733  {
3734  case OP_ANY:
3735  for (i = 1; i <= min; i++)
3736  {
3737  if (eptr >= md->end_subject)
3738  {
3739  SCHECK_PARTIAL();
3741  }
3742  if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3743  eptr++;
3744  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3745  }
3746  break;
3747 
3748  case OP_ALLANY:
3749  for (i = 1; i <= min; i++)
3750  {
3751  if (eptr >= md->end_subject)
3752  {
3753  SCHECK_PARTIAL();
3755  }
3756  eptr++;
3757  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3758  }
3759  break;
3760 
3761  case OP_ANYBYTE:
3762  if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3763  eptr += min;
3764  break;
3765 
3766  case OP_ANYNL:
3767  for (i = 1; i <= min; i++)
3768  {
3769  if (eptr >= md->end_subject)
3770  {
3771  SCHECK_PARTIAL();
3773  }
3774  GETCHARINC(c, eptr);
3775  switch(c)
3776  {
3777  default: MRRETURN(MATCH_NOMATCH);
3778  case 0x000d:
3779  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3780  break;
3781 
3782  case 0x000a:
3783  break;
3784 
3785  case 0x000b:
3786  case 0x000c:
3787  case 0x0085:
3788  case 0x2028:
3789  case 0x2029:
3791  break;
3792  }
3793  }
3794  break;
3795 
3796  case OP_NOT_HSPACE:
3797  for (i = 1; i <= min; i++)
3798  {
3799  if (eptr >= md->end_subject)
3800  {
3801  SCHECK_PARTIAL();
3803  }
3804  GETCHARINC(c, eptr);
3805  switch(c)
3806  {
3807  default: break;
3808  case 0x09: /* HT */
3809  case 0x20: /* SPACE */
3810  case 0xa0: /* NBSP */
3811  case 0x1680: /* OGHAM SPACE MARK */
3812  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3813  case 0x2000: /* EN QUAD */
3814  case 0x2001: /* EM QUAD */
3815  case 0x2002: /* EN SPACE */
3816  case 0x2003: /* EM SPACE */
3817  case 0x2004: /* THREE-PER-EM SPACE */
3818  case 0x2005: /* FOUR-PER-EM SPACE */
3819  case 0x2006: /* SIX-PER-EM SPACE */
3820  case 0x2007: /* FIGURE SPACE */
3821  case 0x2008: /* PUNCTUATION SPACE */
3822  case 0x2009: /* THIN SPACE */
3823  case 0x200A: /* HAIR SPACE */
3824  case 0x202f: /* NARROW NO-BREAK SPACE */
3825  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3826  case 0x3000: /* IDEOGRAPHIC SPACE */
3828  }
3829  }
3830  break;
3831 
3832  case OP_HSPACE:
3833  for (i = 1; i <= min; i++)
3834  {
3835  if (eptr >= md->end_subject)
3836  {
3837  SCHECK_PARTIAL();
3839  }
3840  GETCHARINC(c, eptr);
3841  switch(c)
3842  {
3843  default: MRRETURN(MATCH_NOMATCH);
3844  case 0x09: /* HT */
3845  case 0x20: /* SPACE */
3846  case 0xa0: /* NBSP */
3847  case 0x1680: /* OGHAM SPACE MARK */
3848  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3849  case 0x2000: /* EN QUAD */
3850  case 0x2001: /* EM QUAD */
3851  case 0x2002: /* EN SPACE */
3852  case 0x2003: /* EM SPACE */
3853  case 0x2004: /* THREE-PER-EM SPACE */
3854  case 0x2005: /* FOUR-PER-EM SPACE */
3855  case 0x2006: /* SIX-PER-EM SPACE */
3856  case 0x2007: /* FIGURE SPACE */
3857  case 0x2008: /* PUNCTUATION SPACE */
3858  case 0x2009: /* THIN SPACE */
3859  case 0x200A: /* HAIR SPACE */
3860  case 0x202f: /* NARROW NO-BREAK SPACE */
3861  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3862  case 0x3000: /* IDEOGRAPHIC SPACE */
3863  break;
3864  }
3865  }
3866  break;
3867 
3868  case OP_NOT_VSPACE:
3869  for (i = 1; i <= min; i++)
3870  {
3871  if (eptr >= md->end_subject)
3872  {
3873  SCHECK_PARTIAL();
3875  }
3876  GETCHARINC(c, eptr);
3877  switch(c)
3878  {
3879  default: break;
3880  case 0x0a: /* LF */
3881  case 0x0b: /* VT */
3882  case 0x0c: /* FF */
3883  case 0x0d: /* CR */
3884  case 0x85: /* NEL */
3885  case 0x2028: /* LINE SEPARATOR */
3886  case 0x2029: /* PARAGRAPH SEPARATOR */
3888  }
3889  }
3890  break;
3891 
3892  case OP_VSPACE:
3893  for (i = 1; i <= min; i++)
3894  {
3895  if (eptr >= md->end_subject)
3896  {
3897  SCHECK_PARTIAL();
3899  }
3900  GETCHARINC(c, eptr);
3901  switch(c)
3902  {
3903  default: MRRETURN(MATCH_NOMATCH);
3904  case 0x0a: /* LF */
3905  case 0x0b: /* VT */
3906  case 0x0c: /* FF */
3907  case 0x0d: /* CR */
3908  case 0x85: /* NEL */
3909  case 0x2028: /* LINE SEPARATOR */
3910  case 0x2029: /* PARAGRAPH SEPARATOR */
3911  break;
3912  }
3913  }
3914  break;
3915 
3916  case OP_NOT_DIGIT:
3917  for (i = 1; i <= min; i++)
3918  {
3919  if (eptr >= md->end_subject)
3920  {
3921  SCHECK_PARTIAL();
3923  }
3924  GETCHARINC(c, eptr);
3925  if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3927  }
3928  break;
3929 
3930  case OP_DIGIT:
3931  for (i = 1; i <= min; i++)
3932  {
3933  if (eptr >= md->end_subject)
3934  {
3935  SCHECK_PARTIAL();
3937  }
3938  if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3940  /* No need to skip more bytes - we know it's a 1-byte character */
3941  }
3942  break;
3943 
3944  case OP_NOT_WHITESPACE:
3945  for (i = 1; i <= min; i++)
3946  {
3947  if (eptr >= md->end_subject)
3948  {
3949  SCHECK_PARTIAL();
3951  }
3952  if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3954  while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3955  }
3956  break;
3957 
3958  case OP_WHITESPACE:
3959  for (i = 1; i <= min; i++)
3960  {
3961  if (eptr >= md->end_subject)
3962  {
3963  SCHECK_PARTIAL();
3965  }
3966  if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3968  /* No need to skip more bytes - we know it's a 1-byte character */
3969  }
3970  break;
3971 
3972  case OP_NOT_WORDCHAR:
3973  for (i = 1; i <= min; i++)
3974  {
3975  if (eptr >= md->end_subject)
3976  {
3977  SCHECK_PARTIAL();
3979  }
3980  if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3982  while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3983  }
3984  break;
3985 
3986  case OP_WORDCHAR:
3987  for (i = 1; i <= min; i++)
3988  {
3989  if (eptr >= md->end_subject)
3990  {
3991  SCHECK_PARTIAL();
3993  }
3994  if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3996  /* No need to skip more bytes - we know it's a 1-byte character */
3997  }
3998  break;
3999 
4000  default:
4002  } /* End switch(ctype) */
4003 
4004  else
4005 #endif /* SUPPORT_UTF8 */
4006 
4007  /* Code for the non-UTF-8 case for minimum matching of operators other
4008  than OP_PROP and OP_NOTPROP. */
4009 
4010  switch(ctype)
4011  {
4012  case OP_ANY:
4013  for (i = 1; i <= min; i++)
4014  {
4015  if (eptr >= md->end_subject)
4016  {
4017  SCHECK_PARTIAL();
4019  }
4020  if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4021  eptr++;
4022  }
4023  break;
4024 
4025  case OP_ALLANY:
4026  if (eptr > md->end_subject - min)
4027  {
4028  SCHECK_PARTIAL();
4030  }
4031  eptr += min;
4032  break;
4033 
4034  case OP_ANYBYTE:
4035  if (eptr > md->end_subject - min)
4036  {
4037  SCHECK_PARTIAL();
4039  }
4040  eptr += min;
4041  break;
4042 
4043  case OP_ANYNL:
4044  for (i = 1; i <= min; i++)
4045  {
4046  if (eptr >= md->end_subject)
4047  {
4048  SCHECK_PARTIAL();
4050  }
4051  switch(*eptr++)
4052  {
4053  default: MRRETURN(MATCH_NOMATCH);
4054  case 0x000d:
4055  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4056  break;
4057  case 0x000a:
4058  break;
4059 
4060  case 0x000b:
4061  case 0x000c:
4062  case 0x0085:
4064  break;
4065  }
4066  }
4067  break;
4068 
4069  case OP_NOT_HSPACE:
4070  for (i = 1; i <= min; i++)
4071  {
4072  if (eptr >= md->end_subject)
4073  {
4074  SCHECK_PARTIAL();
4076  }
4077  switch(*eptr++)
4078  {
4079  default: break;
4080  case 0x09: /* HT */
4081  case 0x20: /* SPACE */
4082  case 0xa0: /* NBSP */
4084  }
4085  }
4086  break;
4087 
4088  case OP_HSPACE:
4089  for (i = 1; i <= min; i++)
4090  {
4091  if (eptr >= md->end_subject)
4092  {
4093  SCHECK_PARTIAL();
4095  }
4096  switch(*eptr++)
4097  {
4098  default: MRRETURN(MATCH_NOMATCH);
4099  case 0x09: /* HT */
4100  case 0x20: /* SPACE */
4101  case 0xa0: /* NBSP */
4102  break;
4103  }
4104  }
4105  break;
4106 
4107  case OP_NOT_VSPACE:
4108  for (i = 1; i <= min; i++)
4109  {
4110  if (eptr >= md->end_subject)
4111  {
4112  SCHECK_PARTIAL();
4114  }
4115  switch(*eptr++)
4116  {
4117  default: break;
4118  case 0x0a: /* LF */
4119  case 0x0b: /* VT */
4120  case 0x0c: /* FF */
4121  case 0x0d: /* CR */
4122  case 0x85: /* NEL */
4124  }
4125  }
4126  break;
4127 
4128  case OP_VSPACE:
4129  for (i = 1; i <= min; i++)
4130  {
4131  if (eptr >= md->end_subject)
4132  {
4133  SCHECK_PARTIAL();
4135  }
4136  switch(*eptr++)
4137  {
4138  default: MRRETURN(MATCH_NOMATCH);
4139  case 0x0a: /* LF */
4140  case 0x0b: /* VT */
4141  case 0x0c: /* FF */
4142  case 0x0d: /* CR */
4143  case 0x85: /* NEL */
4144  break;
4145  }
4146  }
4147  break;
4148 
4149  case OP_NOT_DIGIT:
4150  for (i = 1; i <= min; i++)
4151  {
4152  if (eptr >= md->end_subject)
4153  {
4154  SCHECK_PARTIAL();
4156  }
4157  if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4158  }
4159  break;
4160 
4161  case OP_DIGIT:
4162  for (i = 1; i <= min; i++)
4163  {
4164  if (eptr >= md->end_subject)
4165  {
4166  SCHECK_PARTIAL();
4168  }
4169  if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4170  }
4171  break;
4172 
4173  case OP_NOT_WHITESPACE:
4174  for (i = 1; i <= min; i++)
4175  {
4176  if (eptr >= md->end_subject)
4177  {
4178  SCHECK_PARTIAL();
4180  }
4181  if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4182  }
4183  break;
4184 
4185  case OP_WHITESPACE:
4186  for (i = 1; i <= min; i++)
4187  {
4188  if (eptr >= md->end_subject)
4189  {
4190  SCHECK_PARTIAL();
4192  }
4193  if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4194  }
4195  break;
4196 
4197  case OP_NOT_WORDCHAR:
4198  for (i = 1; i <= min; i++)
4199  {
4200  if (eptr >= md->end_subject)
4201  {
4202  SCHECK_PARTIAL();
4204  }
4205  if ((md->ctypes[*eptr++] & ctype_word) != 0)
4207  }
4208  break;
4209 
4210  case OP_WORDCHAR:
4211  for (i = 1; i <= min; i++)
4212  {
4213  if (eptr >= md->end_subject)
4214  {
4215  SCHECK_PARTIAL();
4217  }
4218  if ((md->ctypes[*eptr++] & ctype_word) == 0)
4220  }
4221  break;
4222 
4223  default:
4225  }
4226  }
4227 
4228  /* If min = max, continue at the same level without recursing */
4229 
4230  if (min == max) continue;
4231 
4232  /* If minimizing, we have to test the rest of the pattern before each
4233  subsequent match. Again, separate the UTF-8 case for speed, and also
4234  separate the UCP cases. */
4235 
4236  if (minimize)
4237  {
4238 #ifdef SUPPORT_UCP
4239  if (prop_type >= 0)
4240  {
4241  switch(prop_type)
4242  {
4243  case PT_ANY:
4244  for (fi = min;; fi++)
4245  {
4246  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4247  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4248  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4249  if (eptr >= md->end_subject)
4250  {
4251  SCHECK_PARTIAL();
4253  }
4254  GETCHARINCTEST(c, eptr);
4255  if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4256  }
4257  /* Control never gets here */
4258 
4259  case PT_LAMP:
4260  for (fi = min;; fi++)
4261  {
4262  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4263  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4264  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4265  if (eptr >= md->end_subject)
4266  {
4267  SCHECK_PARTIAL();
4269  }
4270  GETCHARINCTEST(c, eptr);
4271  prop_chartype = UCD_CHARTYPE(c);
4272  if ((prop_chartype == ucp_Lu ||
4273  prop_chartype == ucp_Ll ||
4274  prop_chartype == ucp_Lt) == prop_fail_result)
4276  }
4277  /* Control never gets here */
4278 
4279  case PT_GC:
4280  for (fi = min;; fi++)
4281  {
4282  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4283  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4284  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4285  if (eptr >= md->end_subject)
4286  {
4287  SCHECK_PARTIAL();
4289  }
4290  GETCHARINCTEST(c, eptr);
4291  prop_category = UCD_CATEGORY(c);
4292  if ((prop_category == prop_value) == prop_fail_result)
4294  }
4295  /* Control never gets here */
4296 
4297  case PT_PC:
4298  for (fi = min;; fi++)
4299  {
4300  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4301  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4302  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4303  if (eptr >= md->end_subject)
4304  {
4305  SCHECK_PARTIAL();
4307  }
4308  GETCHARINCTEST(c, eptr);
4309  prop_chartype = UCD_CHARTYPE(c);
4310  if ((prop_chartype == prop_value) == prop_fail_result)
4312  }
4313  /* Control never gets here */
4314 
4315  case PT_SC:
4316  for (fi = min;; fi++)
4317  {
4318  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4319  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4320  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4321  if (eptr >= md->end_subject)
4322  {
4323  SCHECK_PARTIAL();
4325  }
4326  GETCHARINCTEST(c, eptr);
4327  prop_script = UCD_SCRIPT(c);
4328  if ((prop_script == prop_value) == prop_fail_result)
4330  }
4331  /* Control never gets here */
4332 
4333  case PT_ALNUM:
4334  for (fi = min;; fi++)
4335  {
4336  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4337  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4338  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4339  if (eptr >= md->end_subject)
4340  {
4341  SCHECK_PARTIAL();
4343  }
4344  GETCHARINCTEST(c, eptr);
4345  prop_category = UCD_CATEGORY(c);
4346  if ((prop_category == ucp_L || prop_category == ucp_N)
4347  == prop_fail_result)
4349  }
4350  /* Control never gets here */
4351 
4352  case PT_SPACE: /* Perl space */
4353  for (fi = min;; fi++)
4354  {
4355  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4356  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4357  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4358  if (eptr >= md->end_subject)
4359  {
4360  SCHECK_PARTIAL();
4362  }
4363  GETCHARINCTEST(c, eptr);
4364  prop_category = UCD_CATEGORY(c);
4365  if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4366  c == CHAR_FF || c == CHAR_CR)
4367  == prop_fail_result)
4369  }
4370  /* Control never gets here */
4371 
4372  case PT_PXSPACE: /* POSIX space */
4373  for (fi = min;; fi++)
4374  {
4375  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4376  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4377  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4378  if (eptr >= md->end_subject)
4379  {
4380  SCHECK_PARTIAL();
4382  }
4383  GETCHARINCTEST(c, eptr);
4384  prop_category = UCD_CATEGORY(c);
4385  if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4386  c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4387  == prop_fail_result)
4389  }
4390  /* Control never gets here */
4391 
4392  case PT_WORD:
4393  for (fi = min;; fi++)
4394  {
4395  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4396  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4397  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4398  if (eptr >= md->end_subject)
4399  {
4400  SCHECK_PARTIAL();
4402  }
4403  GETCHARINCTEST(c, eptr);
4404  prop_category = UCD_CATEGORY(c);
4405  if ((prop_category == ucp_L ||
4406  prop_category == ucp_N ||
4407  c == CHAR_UNDERSCORE)
4408  == prop_fail_result)
4410  }
4411  /* Control never gets here */
4412 
4413  /* This should never occur */
4414 
4415  default:
4417  }
4418  }
4419 
4420  /* Match extended Unicode sequences. We will get here only if the
4421  support is in the binary; otherwise a compile-time error occurs. */
4422 
4423  else if (ctype == OP_EXTUNI)
4424  {
4425  for (fi = min;; fi++)
4426  {
4427  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4428  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4429  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4430  if (eptr >= md->end_subject)
4431  {
4432  SCHECK_PARTIAL();
4434  }
4435  GETCHARINCTEST(c, eptr);
4436  prop_category = UCD_CATEGORY(c);
4437  if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4438  while (eptr < md->end_subject)
4439  {
4440  int len = 1;
4441  if (!utf8) c = *eptr;
4442  else { GETCHARLEN(c, eptr, len); }
4443  prop_category = UCD_CATEGORY(c);
4444  if (prop_category != ucp_M) break;
4445  eptr += len;
4446  }
4447  }
4448  }
4449 
4450  else
4451 #endif /* SUPPORT_UCP */
4452 
4453 #ifdef SUPPORT_UTF8
4454  /* UTF-8 mode */
4455  if (utf8)
4456  {
4457  for (fi = min;; fi++)
4458  {
4459  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4460  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4461  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4462  if (eptr >= md->end_subject)
4463  {
4464  SCHECK_PARTIAL();
4466  }
4467  if (ctype == OP_ANY && IS_NEWLINE(eptr))
4469  GETCHARINC(c, eptr);
4470  switch(ctype)
4471  {
4472  case OP_ANY: /* This is the non-NL case */
4473  case OP_ALLANY:
4474  case OP_ANYBYTE:
4475  break;
4476 
4477  case OP_ANYNL:
4478  switch(c)
4479  {
4480  default: MRRETURN(MATCH_NOMATCH);
4481  case 0x000d:
4482  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4483  break;
4484  case 0x000a:
4485  break;
4486 
4487  case 0x000b:
4488  case 0x000c:
4489  case 0x0085:
4490  case 0x2028:
4491  case 0x2029:
4493  break;
4494  }
4495  break;
4496 
4497  case OP_NOT_HSPACE:
4498  switch(c)
4499  {
4500  default: break;
4501  case 0x09: /* HT */
4502  case 0x20: /* SPACE */
4503  case 0xa0: /* NBSP */
4504  case 0x1680: /* OGHAM SPACE MARK */
4505  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4506  case 0x2000: /* EN QUAD */
4507  case 0x2001: /* EM QUAD */
4508  case 0x2002: /* EN SPACE */
4509  case 0x2003: /* EM SPACE */
4510  case 0x2004: /* THREE-PER-EM SPACE */
4511  case 0x2005: /* FOUR-PER-EM SPACE */
4512  case 0x2006: /* SIX-PER-EM SPACE */
4513  case 0x2007: /* FIGURE SPACE */
4514  case 0x2008: /* PUNCTUATION SPACE */
4515  case 0x2009: /* THIN SPACE */
4516  case 0x200A: /* HAIR SPACE */
4517  case 0x202f: /* NARROW NO-BREAK SPACE */
4518  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4519  case 0x3000: /* IDEOGRAPHIC SPACE */
4521  }
4522  break;
4523 
4524  case OP_HSPACE:
4525  switch(c)
4526  {
4527  default: MRRETURN(MATCH_NOMATCH);
4528  case 0x09: /* HT */
4529  case 0x20: /* SPACE */
4530  case 0xa0: /* NBSP */
4531  case 0x1680: /* OGHAM SPACE MARK */
4532  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4533  case 0x2000: /* EN QUAD */
4534  case 0x2001: /* EM QUAD */
4535  case 0x2002: /* EN SPACE */
4536  case 0x2003: /* EM SPACE */
4537  case 0x2004: /* THREE-PER-EM SPACE */
4538  case 0x2005: /* FOUR-PER-EM SPACE */
4539  case 0x2006: /* SIX-PER-EM SPACE */
4540  case 0x2007: /* FIGURE SPACE */
4541  case 0x2008: /* PUNCTUATION SPACE */
4542  case 0x2009: /* THIN SPACE */
4543  case 0x200A: /* HAIR SPACE */
4544  case 0x202f: /* NARROW NO-BREAK SPACE */
4545  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4546  case 0x3000: /* IDEOGRAPHIC SPACE */
4547  break;
4548  }
4549  break;
4550 
4551  case OP_NOT_VSPACE:
4552  switch(c)
4553  {
4554  default: break;
4555  case 0x0a: /* LF */
4556  case 0x0b: /* VT */
4557  case 0x0c: /* FF */
4558  case 0x0d: /* CR */
4559  case 0x85: /* NEL */
4560  case 0x2028: /* LINE SEPARATOR */
4561  case 0x2029: /* PARAGRAPH SEPARATOR */
4563  }
4564  break;
4565 
4566  case OP_VSPACE:
4567  switch(c)
4568  {
4569  default: MRRETURN(MATCH_NOMATCH);
4570  case 0x0a: /* LF */
4571  case 0x0b: /* VT */
4572  case 0x0c: /* FF */
4573  case 0x0d: /* CR */
4574  case 0x85: /* NEL */
4575  case 0x2028: /* LINE SEPARATOR */
4576  case 0x2029: /* PARAGRAPH SEPARATOR */
4577  break;
4578  }
4579  break;
4580 
4581  case OP_NOT_DIGIT:
4582  if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4584  break;
4585 
4586  case OP_DIGIT:
4587  if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4589  break;
4590 
4591  case OP_NOT_WHITESPACE:
4592  if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4594  break;
4595 
4596  case OP_WHITESPACE:
4597  if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4599  break;
4600 
4601  case OP_NOT_WORDCHAR:
4602  if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4604  break;
4605 
4606  case OP_WORDCHAR:
4607  if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4609  break;
4610 
4611  default:
4613  }
4614  }
4615  }
4616  else
4617 #endif
4618  /* Not UTF-8 mode */
4619  {
4620  for (fi = min;; fi++)
4621  {
4622  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4623  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4624  if (fi >= max) MRRETURN(MATCH_NOMATCH);
4625  if (eptr >= md->end_subject)
4626  {
4627  SCHECK_PARTIAL();
4629  }
4630  if (ctype == OP_ANY && IS_NEWLINE(eptr))
4632  c = *eptr++;
4633  switch(ctype)
4634  {
4635  case OP_ANY: /* This is the non-NL case */
4636  case OP_ALLANY:
4637  case OP_ANYBYTE:
4638  break;
4639 
4640  case OP_ANYNL:
4641  switch(c)
4642  {
4643  default: MRRETURN(MATCH_NOMATCH);
4644  case 0x000d:
4645  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4646  break;
4647 
4648  case 0x000a:
4649  break;
4650 
4651  case 0x000b:
4652  case 0x000c:
4653  case 0x0085:
4655  break;
4656  }
4657  break;
4658 
4659  case OP_NOT_HSPACE:
4660  switch(c)
4661  {
4662  default: break;
4663  case 0x09: /* HT */
4664  case 0x20: /* SPACE */
4665  case 0xa0: /* NBSP */
4667  }
4668  break;
4669 
4670  case OP_HSPACE:
4671  switch(c)
4672  {
4673  default: MRRETURN(MATCH_NOMATCH);
4674  case 0x09: /* HT */
4675  case 0x20: /* SPACE */
4676  case 0xa0: /* NBSP */
4677  break;
4678  }
4679  break;
4680 
4681  case OP_NOT_VSPACE:
4682  switch(c)
4683  {
4684  default: break;
4685  case 0x0a: /* LF */
4686  case 0x0b: /* VT */
4687  case 0x0c: /* FF */
4688  case 0x0d: /* CR */
4689  case 0x85: /* NEL */
4691  }
4692  break;
4693 
4694  case OP_VSPACE:
4695  switch(c)
4696  {
4697  default: MRRETURN(MATCH_NOMATCH);
4698  case 0x0a: /* LF */
4699  case 0x0b: /* VT */
4700  case 0x0c: /* FF */
4701  case 0x0d: /* CR */
4702  case 0x85: /* NEL */
4703  break;
4704  }
4705  break;
4706 
4707  case OP_NOT_DIGIT:
4708  if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4709  break;
4710 
4711  case OP_DIGIT:
4712  if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4713  break;
4714 
4715  case OP_NOT_WHITESPACE:
4716  if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4717  break;
4718 
4719  case OP_WHITESPACE:
4720  if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4721  break;
4722 
4723  case OP_NOT_WORDCHAR:
4724  if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4725  break;
4726 
4727  case OP_WORDCHAR:
4728  if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4729  break;
4730 
4731  default:
4733  }
4734  }
4735  }
4736  /* Control never gets here */
4737  }
4738 
4739  /* If maximizing, it is worth using inline code for speed, doing the type
4740  test once at the start (i.e. keep it out of the loop). Again, keep the
4741  UTF-8 and UCP stuff separate. */
4742 
4743  else
4744  {
4745  pp = eptr; /* Remember where we started */
4746 
4747 #ifdef SUPPORT_UCP
4748  if (prop_type >= 0)
4749  {
4750  switch(prop_type)
4751  {
4752  case PT_ANY:
4753  for (i = min; i < max; i++)
4754  {
4755  int len = 1;
4756  if (eptr >= md->end_subject)
4757  {
4758  SCHECK_PARTIAL();
4759  break;
4760  }
4761  GETCHARLENTEST(c, eptr, len);
4762  if (prop_fail_result) break;
4763  eptr+= len;
4764  }
4765  break;
4766 
4767  case PT_LAMP:
4768  for (i = min; i < max; i++)
4769  {
4770  int len = 1;
4771  if (eptr >= md->end_subject)
4772  {
4773  SCHECK_PARTIAL();
4774  break;
4775  }
4776  GETCHARLENTEST(c, eptr, len);
4777  prop_chartype = UCD_CHARTYPE(c);
4778  if ((prop_chartype == ucp_Lu ||
4779  prop_chartype == ucp_Ll ||
4780  prop_chartype == ucp_Lt) == prop_fail_result)
4781  break;
4782  eptr+= len;
4783  }
4784  break;
4785 
4786  case PT_GC:
4787  for (i = min; i < max; i++)
4788  {
4789  int len = 1;
4790  if (eptr >= md->end_subject)
4791  {
4792  SCHECK_PARTIAL();
4793  break;
4794  }
4795  GETCHARLENTEST(c, eptr, len);
4796  prop_category = UCD_CATEGORY(c);
4797  if ((prop_category == prop_value) == prop_fail_result)
4798  break;
4799  eptr+= len;
4800  }
4801  break;
4802 
4803  case PT_PC:
4804  for (i = min; i < max; i++)
4805  {
4806  int len = 1;
4807  if (eptr >= md->end_subject)
4808  {
4809  SCHECK_PARTIAL();
4810  break;
4811  }
4812  GETCHARLENTEST(c, eptr, len);
4813  prop_chartype = UCD_CHARTYPE(c);
4814  if ((prop_chartype == prop_value) == prop_fail_result)
4815  break;
4816  eptr+= len;
4817  }
4818  break;
4819 
4820  case PT_SC:
4821  for (i = min; i < max; i++)
4822  {
4823  int len = 1;
4824  if (eptr >= md->end_subject)
4825  {
4826  SCHECK_PARTIAL();
4827  break;
4828  }
4829  GETCHARLENTEST(c, eptr, len);
4830  prop_script = UCD_SCRIPT(c);
4831  if ((prop_script == prop_value) == prop_fail_result)
4832  break;
4833  eptr+= len;
4834  }
4835  break;
4836 
4837  case PT_ALNUM:
4838  for (i = min; i < max; i++)
4839  {
4840  int len = 1;
4841  if (eptr >= md->end_subject)
4842  {
4843  SCHECK_PARTIAL();
4844  break;
4845  }
4846  GETCHARLENTEST(c, eptr, len);
4847  prop_category = UCD_CATEGORY(c);
4848  if ((prop_category == ucp_L || prop_category == ucp_N)
4849  == prop_fail_result)
4850  break;
4851  eptr+= len;
4852  }
4853  break;
4854 
4855  case PT_SPACE: /* Perl space */
4856  for (i = min; i < max; i++)
4857  {
4858  int len = 1;
4859  if (eptr >= md->end_subject)
4860  {
4861  SCHECK_PARTIAL();
4862  break;
4863  }
4864  GETCHARLENTEST(c, eptr, len);
4865  prop_category = UCD_CATEGORY(c);
4866  if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4867  c == CHAR_FF || c == CHAR_CR)
4868  == prop_fail_result)
4869  break;
4870  eptr+= len;
4871  }
4872  break;
4873 
4874  case PT_PXSPACE: /* POSIX space */
4875  for (i = min; i < max; i++)
4876  {
4877  int len = 1;
4878  if (eptr >= md->end_subject)
4879  {
4880  SCHECK_PARTIAL();
4881  break;
4882  }
4883  GETCHARLENTEST(c, eptr, len);
4884  prop_category = UCD_CATEGORY(c);
4885  if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4886  c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4887  == prop_fail_result)
4888  break;
4889  eptr+= len;
4890  }
4891  break;
4892 
4893  case PT_WORD:
4894  for (i = min; i < max; i++)
4895  {
4896  int len = 1;
4897  if (eptr >= md->end_subject)
4898  {
4899  SCHECK_PARTIAL();
4900  break;
4901  }
4902  GETCHARLENTEST(c, eptr, len);
4903  prop_category = UCD_CATEGORY(c);
4904  if ((prop_category == ucp_L || prop_category == ucp_N ||
4905  c == CHAR_UNDERSCORE) == prop_fail_result)
4906  break;
4907  eptr+= len;
4908  }
4909  break;
4910 
4911  default:
4913  }
4914 
4915  /* eptr is now past the end of the maximum run */
4916 
4917  if (possessive) continue;
4918  for(;;)
4919  {
4920  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4921  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4922  if (eptr-- == pp) break; /* Stop if tried at original pos */
4923  if (utf8) BACKCHAR(eptr);
4924  }
4925  }
4926 
4927  /* Match extended Unicode sequences. We will get here only if the
4928  support is in the binary; otherwise a compile-time error occurs. */
4929 
4930  else if (ctype == OP_EXTUNI)
4931  {
4932  for (i = min; i < max; i++)
4933  {
4934  if (eptr >= md->end_subject)
4935  {
4936  SCHECK_PARTIAL();
4937  break;
4938  }
4939  GETCHARINCTEST(c, eptr);
4940  prop_category = UCD_CATEGORY(c);
4941  if (prop_category == ucp_M) break;
4942  while (eptr < md->end_subject)
4943  {
4944  int len = 1;
4945  if (!utf8) c = *eptr; else
4946  {
4947  GETCHARLEN(c, eptr, len);
4948  }
4949  prop_category = UCD_CATEGORY(c);
4950  if (prop_category != ucp_M) break;
4951  eptr += len;
4952  }
4953  }
4954 
4955  /* eptr is now past the end of the maximum run */
4956 
4957  if (possessive) continue;
4958 
4959  for(;;)
4960  {
4961  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4962  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4963  if (eptr-- == pp) break; /* Stop if tried at original pos */
4964  for (;;) /* Move back over one extended */
4965  {
4966  int len = 1;
4967  if (!utf8) c = *eptr; else
4968  {
4969  BACKCHAR(eptr);
4970  GETCHARLEN(c, eptr, len);
4971  }
4972  prop_category = UCD_CATEGORY(c);
4973  if (prop_category != ucp_M) break;
4974  eptr--;
4975  }
4976  }
4977  }
4978 
4979  else
4980 #endif /* SUPPORT_UCP */
4981 
4982 #ifdef SUPPORT_UTF8
4983  /* UTF-8 mode */
4984 
4985  if (utf8)
4986  {
4987  switch(ctype)
4988  {
4989  case OP_ANY:
4990  if (max < INT_MAX)
4991  {
4992  for (i = min; i < max; i++)
4993  {
4994  if (eptr >= md->end_subject)
4995  {
4996  SCHECK_PARTIAL();
4997  break;
4998  }
4999  if (IS_NEWLINE(eptr)) break;
5000  eptr++;
5001  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5002  }
5003  }
5004 
5005  /* Handle unlimited UTF-8 repeat */
5006 
5007  else
5008  {
5009  for (i = min; i < max; i++)
5010  {
5011  if (eptr >= md->end_subject)
5012  {
5013  SCHECK_PARTIAL();
5014  break;
5015  }
5016  if (IS_NEWLINE(eptr)) break;
5017  eptr++;
5018  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5019  }
5020  }
5021  break;
5022 
5023  case OP_ALLANY:
5024  if (max < INT_MAX)
5025  {
5026  for (i = min; i < max; i++)
5027  {
5028  if (eptr >= md->end_subject)
5029  {
5030  SCHECK_PARTIAL();
5031  break;
5032  }
5033  eptr++;
5034  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5035  }
5036  }
5037  else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5038  break;
5039 
5040  /* The byte case is the same as non-UTF8 */
5041 
5042  case OP_ANYBYTE:
5043  c = max - min;
5044  if (c > (unsigned int)(md->end_subject - eptr))
5045  {
5046  eptr = md->end_subject;
5047  SCHECK_PARTIAL();
5048  }
5049  else eptr += c;
5050  break;
5051 
5052  case OP_ANYNL:
5053  for (i = min; i < max; i++)
5054  {
5055  int len = 1;
5056  if (eptr >= md->end_subject)
5057  {
5058  SCHECK_PARTIAL();
5059  break;
5060  }
5061  GETCHARLEN(c, eptr, len);
5062  if (c == 0x000d)
5063  {
5064  if (++eptr >= md->end_subject) break;
5065  if (*eptr == 0x000a) eptr++;
5066  }
5067  else
5068  {
5069  if (c != 0x000a &&
5070  (md->bsr_anycrlf ||
5071  (c != 0x000b && c != 0x000c &&
5072  c != 0x0085 && c != 0x2028 && c != 0x2029)))
5073  break;
5074  eptr += len;
5075  }
5076  }
5077  break;
5078 
5079  case OP_NOT_HSPACE:
5080  case OP_HSPACE:
5081  for (i = min; i < max; i++)
5082  {
5083  BOOL gotspace;
5084  int len = 1;
5085  if (eptr >= md->end_subject)
5086  {
5087  SCHECK_PARTIAL();
5088  break;
5089  }
5090  GETCHARLEN(c, eptr, len);
5091  switch(c)
5092  {
5093  default: gotspace = FALSE; break;
5094  case 0x09: /* HT */
5095  case 0x20: /* SPACE */
5096  case 0xa0: /* NBSP */
5097  case 0x1680: /* OGHAM SPACE MARK */
5098  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5099  case 0x2000: /* EN QUAD */
5100  case 0x2001: /* EM QUAD */
5101  case 0x2002: /* EN SPACE */
5102  case 0x2003: /* EM SPACE */
5103  case 0x2004: /* THREE-PER-EM SPACE */
5104  case 0x2005: /* FOUR-PER-EM SPACE */
5105  case 0x2006: /* SIX-PER-EM SPACE */
5106  case 0x2007: /* FIGURE SPACE */
5107  case 0x2008: /* PUNCTUATION SPACE */
5108  case 0x2009: /* THIN SPACE */
5109  case 0x200A: /* HAIR SPACE */
5110  case 0x202f: /* NARROW NO-BREAK SPACE */
5111  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5112  case 0x3000: /* IDEOGRAPHIC SPACE */
5113  gotspace = TRUE;
5114  break;
5115  }
5116  if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5117  eptr += len;
5118  }
5119  break;
5120 
5121  case OP_NOT_VSPACE:
5122  case OP_VSPACE:
5123  for (i = min; i < max; i++)
5124  {
5125  BOOL gotspace;
5126  int len = 1;
5127  if (eptr >= md->end_subject)
5128  {
5129  SCHECK_PARTIAL();
5130  break;
5131  }
5132  GETCHARLEN(c, eptr, len);
5133  switch(c)
5134  {
5135  default: gotspace = FALSE; break;
5136  case 0x0a: /* LF */
5137  case 0x0b: /* VT */
5138  case 0x0c: /* FF */
5139  case 0x0d: /* CR */
5140  case 0x85: /* NEL */
5141  case 0x2028: /* LINE SEPARATOR */
5142  case 0x2029: /* PARAGRAPH SEPARATOR */
5143  gotspace = TRUE;
5144  break;
5145  }
5146  if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5147  eptr += len;
5148  }
5149  break;
5150 
5151  case OP_NOT_DIGIT:
5152  for (i = min; i < max; i++)
5153  {
5154  int len = 1;
5155  if (eptr >= md->end_subject)
5156  {
5157  SCHECK_PARTIAL();
5158  break;
5159  }
5160  GETCHARLEN(c, eptr, len);
5161  if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5162  eptr+= len;
5163  }
5164  break;
5165 
5166  case OP_DIGIT:
5167  for (i = min; i < max; i++)
5168  {
5169  int len = 1;
5170  if (eptr >= md->end_subject)
5171  {
5172  SCHECK_PARTIAL();
5173  break;
5174  }
5175  GETCHARLEN(c, eptr, len);
5176  if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5177  eptr+= len;
5178  }
5179  break;
5180 
5181  case OP_NOT_WHITESPACE:
5182  for (i = min; i < max; i++)
5183  {
5184  int len = 1;
5185  if (eptr >= md->end_subject)
5186  {
5187  SCHECK_PARTIAL();
5188  break;
5189  }
5190  GETCHARLEN(c, eptr, len);
5191  if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5192  eptr+= len;
5193  }
5194  break;
5195 
5196  case OP_WHITESPACE:
5197  for (i = min; i < max; i++)
5198  {
5199  int len = 1;
5200  if (eptr >= md->end_subject)
5201  {
5202  SCHECK_PARTIAL();
5203  break;
5204  }
5205  GETCHARLEN(c, eptr, len);
5206  if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5207  eptr+= len;
5208  }
5209  break;
5210 
5211  case OP_NOT_WORDCHAR:
5212  for (i = min; i < max; i++)
5213  {
5214  int len = 1;
5215  if (eptr >= md->end_subject)
5216  {
5217  SCHECK_PARTIAL();
5218  break;
5219  }
5220  GETCHARLEN(c, eptr, len);
5221  if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5222  eptr+= len;
5223  }
5224  break;
5225 
5226  case OP_WORDCHAR:
5227  for (i = min; i < max; i++)
5228  {
5229  int len = 1;
5230  if (eptr >= md->end_subject)
5231  {
5232  SCHECK_PARTIAL();
5233  break;
5234  }
5235  GETCHARLEN(c, eptr, len);
5236  if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5237  eptr+= len;
5238  }
5239  break;
5240 
5241  default:
5243  }
5244 
5245  /* eptr is now past the end of the maximum run */
5246 
5247  if (possessive) continue;
5248  for(;;)
5249  {
5250  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5251  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5252  if (eptr-- == pp) break; /* Stop if tried at original pos */
5253  BACKCHAR(eptr);
5254  }
5255  }
5256  else
5257 #endif /* SUPPORT_UTF8 */
5258 
5259  /* Not UTF-8 mode */
5260  {
5261  switch(ctype)
5262  {
5263  case OP_ANY:
5264  for (i = min; i < max; i++)
5265  {
5266  if (eptr >= md->end_subject)
5267  {
5268  SCHECK_PARTIAL();
5269  break;
5270  }
5271  if (IS_NEWLINE(eptr)) break;
5272  eptr++;
5273  }
5274  break;
5275 
5276  case OP_ALLANY:
5277  case OP_ANYBYTE:
5278  c = max - min;
5279  if (c > (unsigned int)(md->end_subject - eptr))
5280  {
5281  eptr = md->end_subject;
5282  SCHECK_PARTIAL();
5283  }
5284  else eptr += c;
5285  break;
5286 
5287  case OP_ANYNL:
5288  for (i = min; i < max; i++)
5289  {
5290  if (eptr >= md->end_subject)
5291  {
5292  SCHECK_PARTIAL();
5293  break;
5294  }
5295  c = *eptr;
5296  if (c == 0x000d)
5297  {
5298  if (++eptr >= md->end_subject) break;
5299  if (*eptr == 0x000a) eptr++;
5300  }
5301  else
5302  {
5303  if (c != 0x000a &&
5304  (md->bsr_anycrlf ||
5305  (c != 0x000b && c != 0x000c && c != 0x0085)))
5306  break;
5307  eptr++;
5308  }
5309  }
5310  break;
5311 
5312  case OP_NOT_HSPACE:
5313  for (i = min; i < max; i++)
5314  {
5315  if (eptr >= md->end_subject)
5316  {
5317  SCHECK_PARTIAL();
5318  break;
5319  }
5320  c = *eptr;
5321  if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5322  eptr++;
5323  }
5324  break;
5325 
5326  case OP_HSPACE:
5327  for (i = min; i < max; i++)
5328  {
5329  if (eptr >= md->end_subject)
5330  {
5331  SCHECK_PARTIAL();
5332  break;
5333  }
5334  c = *eptr;
5335  if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5336  eptr++;
5337  }
5338  break;
5339 
5340  case OP_NOT_VSPACE:
5341  for (i = min; i < max; i++)
5342  {
5343  if (eptr >= md->end_subject)
5344  {
5345  SCHECK_PARTIAL();
5346  break;
5347  }
5348  c = *eptr;
5349  if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5350  break;
5351  eptr++;
5352  }
5353  break;
5354 
5355  case OP_VSPACE:
5356  for (i = min; i < max; i++)
5357  {
5358  if (eptr >= md->end_subject)
5359  {
5360  SCHECK_PARTIAL();
5361  break;
5362  }
5363  c = *eptr;
5364  if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5365  break;
5366  eptr++;
5367  }
5368  break;
5369 
5370  case OP_NOT_DIGIT:
5371  for (i = min; i < max; i++)
5372  {
5373  if (eptr >= md->end_subject)
5374  {
5375  SCHECK_PARTIAL();
5376  break;
5377  }
5378  if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5379  eptr++;
5380  }
5381  break;
5382 
5383  case OP_DIGIT:
5384  for (i = min; i < max; i++)
5385  {
5386  if (eptr >= md->end_subject)
5387  {
5388  SCHECK_PARTIAL();
5389  break;
5390  }
5391  if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5392  eptr++;
5393  }
5394  break;
5395 
5396  case OP_NOT_WHITESPACE:
5397  for (i = min; i < max; i++)
5398  {
5399  if (eptr >= md->end_subject)
5400  {
5401  SCHECK_PARTIAL();
5402  break;
5403  }
5404  if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5405  eptr++;
5406  }
5407  break;
5408 
5409  case OP_WHITESPACE:
5410  for (i = min; i < max; i++)
5411  {
5412  if (eptr >= md->end_subject)
5413  {
5414  SCHECK_PARTIAL();
5415  break;
5416  }
5417  if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5418  eptr++;
5419  }
5420  break;
5421 
5422  case OP_NOT_WORDCHAR:
5423  for (i = min; i < max; i++)
5424  {
5425  if (eptr >= md->end_subject)
5426  {
5427  SCHECK_PARTIAL();
5428  break;
5429  }
5430  if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5431  eptr++;
5432  }
5433  break;
5434 
5435  case OP_WORDCHAR:
5436  for (i = min; i < max; i++)
5437  {
5438  if (eptr >= md->end_subject)
5439  {
5440  SCHECK_PARTIAL();
5441  break;
5442  }
5443  if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5444  eptr++;
5445  }
5446  break;
5447 
5448  default:
5450  }
5451 
5452  /* eptr is now past the end of the maximum run */
5453 
5454  if (possessive) continue;
5455  while (eptr >= pp)
5456  {
5457  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5458  eptr--;
5459  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5460  }
5461  }
5462 
5463  /* Get here if we can't make it match with any permitted repetitions */
5464 
5466  }
5467  /* Control never gets here */
5468 
5469  /* There's been some horrible disaster. Arrival here can only mean there is
5470  something seriously wrong in the code above or the OP_xxx definitions. */
5471 
5472  default:
5473  DPRINTF(("Unknown opcode %d\n", *ecode));
5475  }
5476 
5477  /* Do not stick any code in here without much thought; it is assumed
5478  that "continue" in the code above comes out to here to repeat the main
5479  loop. */
5480 
5481  } /* End of main loop */
5482 /* Control never reaches here */
5483 
5484 
5485 /* When compiling to use the heap rather than the stack for recursive calls to
5486 match(), the RRETURN() macro jumps here. The number that is saved in
5487 frame->Xwhere indicates which label we actually want to return to. */
5488 
5489 #ifdef NO_RECURSE
5490 #define LBL(val) case val: goto L_RM##val;
5491 HEAP_RETURN:
5492 switch (frame->Xwhere)
5493  {
5494  LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5495  LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5496  LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5497  LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5498  LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5499 #ifdef SUPPORT_UTF8
5500  LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5501  LBL(32) LBL(34) LBL(42) LBL(46)
5502 #ifdef SUPPORT_UCP
5503  LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5504  LBL(59) LBL(60) LBL(61) LBL(62)
5505 #endif /* SUPPORT_UCP */
5506 #endif /* SUPPORT_UTF8 */
5507  default:
5508  DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5509  return PCRE_ERROR_INTERNAL;
5510  }
5511 #undef LBL
5512 #endif /* NO_RECURSE */
5513 }
5514 
5515 
5516 /***************************************************************************
5517 ****************************************************************************
5518  RECURSION IN THE match() FUNCTION
5519 
5520 Undefine all the macros that were defined above to handle this. */
5521 
5522 #ifdef NO_RECURSE /* heap-based recursion build: the names below are macros only in this configuration */
5523 #undef eptr
5524 #undef ecode
5525 #undef mstart
5526 #undef offset_top
5527 #undef ims
5528 #undef eptrb
5529 #undef flags
5530 
5531 #undef callpat
5532 #undef charptr
5533 #undef data
5534 #undef next
5535 #undef pp
5536 #undef prev
5537 #undef saved_eptr
5538 
5539 #undef new_recursive
5540 
5541 #undef cur_is_word
5542 #undef condition
5543 #undef prev_is_word
5544 
5545 #undef original_ims
5546 
5547 #undef ctype
5548 #undef length
5549 #undef max
5550 #undef min
5551 #undef number
5552 #undef offset
5553 #undef op
5554 #undef save_capture_last
5555 #undef save_offset1
5556 #undef save_offset2
5557 #undef save_offset3
5558 #undef stacksave
5559 
5560 #undef newptrb
5561 
5562 #endif /* NO_RECURSE */
5563 
5564 /* These two are defined as macros in both cases (presumably with and without NO_RECURSE), so they are undefined outside the NO_RECURSE conditional */
5565 
5566 #undef fc
5567 #undef fi
5568 
5569 /***************************************************************************
5570 ***************************************************************************/
5571 
5572 
5573 
5574 /*************************************************
5575 * Execute a Regular Expression *
5576 *************************************************/
5577 
5578 /* This function applies a compiled re to a subject string and picks out
5579 portions of the string if it matches. Two elements in the vector are set for
5580 each substring: the offsets to the start and end of the substring.
5581 
5582 Arguments:
5583  argument_re points to the compiled expression
5584  extra_data points to extra data or is NULL
5585  subject points to the subject string
5586  length length of subject string (may contain binary zeros)
5587  start_offset where to start in the subject string
5588  options option bits
5589  offsets points to a vector of ints to be filled in with offsets
5590  offsetcount the number of elements in the vector
5591 
5592 Returns: > 0 => success; value is the number of elements filled in
5593  = 0 => success, but offsets is not big enough
5594  -1 => failed to match
5595  < -1 => some kind of unexpected problem
5596 */
5597 
5599 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5600  PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5601  int offsetcount)
5602 {
5603 int rc, resetcount, ocount;
5604 int first_byte = -1;
5605 int req_byte = -1;
5606 int req_byte2 = -1;
5607 int newline;
5608 unsigned long int ims;
5609 BOOL using_temporary_offsets = FALSE;
5610 BOOL anchored;
5611 BOOL startline;
5612 BOOL firstline;
5613 BOOL first_byte_caseless = FALSE;
5614 BOOL req_byte_caseless = FALSE;
5615 BOOL utf8;
5616 match_data match_block;
5617 match_data *md = &match_block;
5618 const uschar *tables;
5619 const uschar *start_bits = NULL;
5620 USPTR start_match = (USPTR)subject + start_offset;
5621 USPTR end_subject;
5622 USPTR start_partial = NULL;
5623 USPTR req_byte_ptr = start_match - 1;
5624 
5625 pcre_study_data internal_study;
5626 const pcre_study_data *study;
5627 
5628 real_pcre internal_re;
5629 const real_pcre *external_re = (const real_pcre *)argument_re;
5630 const real_pcre *re = external_re;
5631 
5632 /* Plausibility checks */
5633 
5634 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5635 if (re == NULL || subject == NULL ||
5636  (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5637 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5638 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5639 
5640 /* This information is for finding all the numbers associated with a given
5641 name, for condition testing. */
5642 
5643 md->name_table = (uschar *)re + re->name_table_offset;
5644 md->name_count = re->name_count;
5646 
5647 /* Fish out the optional data from the extra_data structure, first setting
5648 the default values. */
5649 
5650 study = NULL;
5651 md->match_limit = MATCH_LIMIT;
5653 md->callout_data = NULL;
5654 
5655 /* The table pointer is always in native byte order. */
5656 
5657 tables = external_re->tables;
5658 
5659 if (extra_data != NULL)
5660  {
5661  register unsigned int flags = extra_data->flags;
5662  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5663  study = (const pcre_study_data *)extra_data->study_data;
5664  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5665  md->match_limit = extra_data->match_limit;
5666  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5667  md->match_limit_recursion = extra_data->match_limit_recursion;
5668  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5669  md->callout_data = extra_data->callout_data;
5670  if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5671  }
5672 
5673 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5674 is a feature that makes it possible to save compiled regexes and re-use them
5675 in other programs later. */
5676 
5677 if (tables == NULL) tables = _pcre_default_tables;
5678 
5679 /* Check that the first field in the block is the magic number. If it is not,
5680 test for a regex that was compiled on a host of opposite endianness. If this is
5681 the case, flipped values are put in internal_re and internal_study if there was
5682 study data too. */
5683 
5684 if (re->magic_number != MAGIC_NUMBER)
5685  {
5686  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5687  if (re == NULL) return PCRE_ERROR_BADMAGIC;
5688  if (study != NULL) study = &internal_study;
5689  }
5690 
5691 /* Set up other data */
5692 
5693 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5694 startline = (re->flags & PCRE_STARTLINE) != 0;
5695 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5696 
5697 /* The code starts after the real_pcre block and the capture name table. */
5698 
5699 md->start_code = (const uschar *)external_re + re->name_table_offset +
5700  re->name_count * re->name_entry_size;
5701 
5702 md->start_subject = (USPTR)subject;
5703 md->start_offset = start_offset;
5704 md->end_subject = md->start_subject + length;
5705 end_subject = md->end_subject;
5706 
5707 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5708 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5709 md->use_ucp = (re->options & PCRE_UCP) != 0;
5710 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5711 
5712 md->notbol = (options & PCRE_NOTBOL) != 0;
5713 md->noteol = (options & PCRE_NOTEOL) != 0;
5714 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5715 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5716 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5717  ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5718 md->hitend = FALSE;
5719 md->mark = NULL; /* In case never set */
5720 
5721 md->recursive = NULL; /* No recursion at top level */
5722 
5723 md->lcc = tables + lcc_offset;
5724 md->ctypes = tables + ctypes_offset;
5725 
5726 /* Handle different \R options. */
5727 
5728 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5729  {
5730  case 0:
5731  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5732  md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5733  else
5734 #ifdef BSR_ANYCRLF
5735  md->bsr_anycrlf = TRUE;
5736 #else
5737  md->bsr_anycrlf = FALSE;
5738 #endif
5739  break;
5740 
5741  case PCRE_BSR_ANYCRLF:
5742  md->bsr_anycrlf = TRUE;
5743  break;
5744 
5745  case PCRE_BSR_UNICODE:
5746  md->bsr_anycrlf = FALSE;
5747  break;
5748 
5749  default: return PCRE_ERROR_BADNEWLINE;
5750  }
5751 
5752 /* Handle different types of newline. The three bits give eight cases. If
5753 nothing is set at run time, whatever was used at compile time applies. */
5754 
5755 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5756  (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5757  {
5758  case 0: newline = NEWLINE; break; /* Compile-time default */
5759  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5760  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5761  case PCRE_NEWLINE_CR+
5762  PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5763  case PCRE_NEWLINE_ANY: newline = -1; break;
5764  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5765  default: return PCRE_ERROR_BADNEWLINE;
5766  }
5767 
5768 if (newline == -2)
5769  {
5770  md->nltype = NLTYPE_ANYCRLF;
5771  }
5772 else if (newline < 0)
5773  {
5774  md->nltype = NLTYPE_ANY;
5775  }
5776 else
5777  {
5778  md->nltype = NLTYPE_FIXED;
5779  if (newline > 255)
5780  {
5781  md->nllen = 2;
5782  md->nl[0] = (newline >> 8) & 255;
5783  md->nl[1] = newline & 255;
5784  }
5785  else
5786  {
5787  md->nllen = 1;
5788  md->nl[0] = newline;
5789  }
5790  }
5791 
5792 /* Partial matching was originally supported only for a restricted set of
5793 regexes; from release 8.00 there are no restrictions, but the bits are still
5794 defined (though never set). So there's no harm in leaving this code. */
5795 
5796 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5797  return PCRE_ERROR_BADPARTIAL;
5798 
5799 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5800 back the character offset. */
5801 
5802 #ifdef SUPPORT_UTF8
5803 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5804  {
5805  int tb;
5806  if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
5807  return (tb == length && md->partial > 1)?
5809  if (start_offset > 0 && start_offset < length)
5810  {
5811  tb = ((USPTR)subject)[start_offset] & 0xc0;
5812  if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
5813  }
5814  }
5815 #endif
5816 
5817 /* The ims options can vary during the matching as a result of the presence
5818 of (?ims) items in the pattern. They are kept in a local variable so that
5819 restoring at the exit of a group is easy. */
5820 
5821 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5822 
5823 /* If the expression has got more back references than the offsets supplied can
5824 hold, we get a temporary chunk of working store to use during the matching.
5825 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5826 of 3. */
5827 
5828 ocount = offsetcount - (offsetcount % 3);
5829 
5830 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5831  {
5832  ocount = re->top_backref * 3 + 3;
5833  md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5834  if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5835  using_temporary_offsets = TRUE;
5836  DPRINTF(("Got memory to hold back references\n"));
5837  }
5838 else md->offset_vector = offsets;
5839 
5840 md->offset_end = ocount;
5841 md->offset_max = (2*ocount)/3;
5842 md->offset_overflow = FALSE;
5843 md->capture_last = -1;
5844 
5845 /* Compute the minimum number of offsets that we need to reset each time. Doing
5846 this makes a huge difference to execution time when there aren't many brackets
5847 in the pattern. */
5848 
5849 resetcount = 2 + re->top_bracket * 2;
5850 if (resetcount > offsetcount) resetcount = ocount;
5851 
5852 /* Reset the working variable associated with each extraction. These should
5853 never be used unless previously set, but they get saved and restored, and so we
5854 initialize them to avoid reading uninitialized locations. */
5855 
5856 if (md->offset_vector != NULL)
5857  {
5858  register int *iptr = md->offset_vector + ocount;
5859  register int *iend = iptr - resetcount/2 + 1;
5860  while (--iptr >= iend) *iptr = -1;
5861  }
5862 
5863 /* Set up the first character to match, if available. The first_byte value is
5864 never set for an anchored regular expression, but the anchoring may be forced
5865 at run time, so we have to test for anchoring. The first char may be unset for
5866 an unanchored pattern, of course. If there's no first char and the pattern was
5867 studied, there may be a bitmap of possible first characters. */
5868 
5869 if (!anchored)
5870  {
5871  if ((re->flags & PCRE_FIRSTSET) != 0)
5872  {
5873  first_byte = re->first_byte & 255;
5874  if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5875  first_byte = md->lcc[first_byte];
5876  }
5877  else
5878  if (!startline && study != NULL &&
5879  (study->flags & PCRE_STUDY_MAPPED) != 0)
5880  start_bits = study->start_bits;
5881  }
5882 
5883 /* For anchored or unanchored matches, there may be a "last known required
5884 character" set. */
5885 
5886 if ((re->flags & PCRE_REQCHSET) != 0)
5887  {
5888  req_byte = re->req_byte & 255;
5889  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5890  req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5891  }
5892 
5893 
5894 /* ==========================================================================*/
5895 
5896 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5897 the loop runs just once. */
5898 
5899 for(;;)
5900  {
5901  USPTR save_end_subject = end_subject;
5902  USPTR new_start_match;
5903 
5904  /* Reset the maximum number of extractions we might see. */
5905 
5906  if (md->offset_vector != NULL)
5907  {
5908  register int *iptr = md->offset_vector;
5909  register int *iend = iptr + resetcount;
5910  while (iptr < iend) *iptr++ = -1;
5911  }
5912 
5913  /* If firstline is TRUE, the start of the match is constrained to the first
5914  line of a multiline string. That is, the match must be before or at the first
5915  newline. Implement this by temporarily adjusting end_subject so that we stop
5916  scanning at a newline. If the match fails at the newline, later code breaks
5917  this loop. */
5918 
5919  if (firstline)
5920  {
5921  USPTR t = start_match;
5922 #ifdef SUPPORT_UTF8
5923  if (utf8)
5924  {
5925  while (t < md->end_subject && !IS_NEWLINE(t))
5926  {
5927  t++;
5928  while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5929  }
5930  }
5931  else
5932 #endif
5933  while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5934  end_subject = t;
5935  }
5936 
5937  /* There are some optimizations that avoid running the match if a known
5938  starting point is not found, or if a known later character is not present.
5939  However, there is an option that disables these, for testing and for ensuring
5940  that all callouts do actually occur. The option can be set in the regex by
5941  (*NO_START_OPT) or passed in match-time options. */
5942 
5943  if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
5944  {
5945  /* Advance to a unique first byte if there is one. */
5946 
5947  if (first_byte >= 0)
5948  {
5949  if (first_byte_caseless)
5950  while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5951  start_match++;
5952  else
5953  while (start_match < end_subject && *start_match != first_byte)
5954  start_match++;
5955  }
5956 
5957  /* Or to just after a linebreak for a multiline match */
5958 
5959  else if (startline)
5960  {
5961  if (start_match > md->start_subject + start_offset)
5962  {
5963 #ifdef SUPPORT_UTF8
5964  if (utf8)
5965  {
5966  while (start_match < end_subject && !WAS_NEWLINE(start_match))
5967  {
5968  start_match++;
5969  while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5970  start_match++;
5971  }
5972  }
5973  else
5974 #endif
5975  while (start_match < end_subject && !WAS_NEWLINE(start_match))
5976  start_match++;
5977 
5978  /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5979  and we are now at a LF, advance the match position by one more character.
5980  */
5981 
5982  if (start_match[-1] == CHAR_CR &&
5983  (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5984  start_match < end_subject &&
5985  *start_match == CHAR_NL)
5986  start_match++;
5987  }
5988  }
5989 
5990  /* Or to a non-unique first byte after study */
5991 
5992  else if (start_bits != NULL)
5993  {
5994  while (start_match < end_subject)
5995  {
5996  register unsigned int c = *start_match;
5997  if ((start_bits[c/8] & (1 << (c&7))) == 0)
5998  {
5999  start_match++;
6000 #ifdef SUPPORT_UTF8
6001  if (utf8)
6002  while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6003  start_match++;
6004 #endif
6005  }
6006  else break;
6007  }
6008  }
6009  } /* Starting optimizations */
6010 
6011  /* Restore fudged end_subject */
6012 
6013  end_subject = save_end_subject;
6014 
6015  /* The following two optimizations are disabled for partial matching or if
6016  disabling is explicitly requested. */
6017 
6018  if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6019  {
6020  /* If the pattern was studied, a minimum subject length may be set. This is
6021  only a lower bound: there may in fact be no string of that length that matches
6022  the pattern. Although the value is, strictly, in characters, we treat it as
6023  bytes to avoid spending too much time in this optimization. */
6024 
6025  if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6026  (pcre_uint32)(end_subject - start_match) < study->minlength)
6027  {
6028  rc = MATCH_NOMATCH;
6029  break;
6030  }
6031 
6032  /* If req_byte is set, we know that that character must appear in the
6033  subject for the match to succeed. If the first character is set, req_byte
6034  must be later in the subject; otherwise the test starts at the match point.
6035  This optimization can save a huge amount of backtracking in patterns with
6036  nested unlimited repeats that aren't going to match. Writing separate code
6037  for cased/caseless versions makes it go faster, as does using an
6038  autoincrement and backing off on a match.
6039 
6040  HOWEVER: when the subject string is very, very long, searching to its end
6041  can take a long time, and give bad performance on quite ordinary patterns.
6042  This showed up when somebody was matching something like /^\d+C/ on a
6043  32-megabyte string... so we don't do this when the string is sufficiently
6044  long. */
6045 
6046  if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6047  {
6048  register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6049 
6050  /* We don't need to repeat the search if we haven't yet reached the
6051  place we found it at last time. */
6052 
6053  if (p > req_byte_ptr)
6054  {
6055  if (req_byte_caseless)
6056  {
6057  while (p < end_subject)
6058  {
6059  register int pp = *p++;
6060  if (pp == req_byte || pp == req_byte2) { p--; break; }
6061  }
6062  }
6063  else
6064  {
6065  while (p < end_subject)
6066  {
6067  if (*p++ == req_byte) { p--; break; }
6068  }
6069  }
6070 
6071  /* If we can't find the required character, break the matching loop,
6072  forcing a match failure. */
6073 
6074  if (p >= end_subject)
6075  {
6076  rc = MATCH_NOMATCH;
6077  break;
6078  }
6079 
6080  /* If we have found the required character, save the point where we
6081  found it, so that we don't search again next time round the loop if
6082  the start hasn't passed this character yet. */
6083 
6084  req_byte_ptr = p;
6085  }
6086  }
6087  }
6088 
6089 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6090  printf(">>>> Match against: ");
6091  pchars(start_match, end_subject - start_match, TRUE, md);
6092  printf("\n");
6093 #endif
6094 
6095  /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6096  first starting point for which a partial match was found. */
6097 
6098  md->start_match_ptr = start_match;
6099  md->start_used_ptr = start_match;
6100  md->match_call_count = 0;
6101  rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6102  0, 0);
6103  if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6104 
6105  switch(rc)
6106  {
6107  /* SKIP passes back the next starting point explicitly, but if it is the
6108  same as the match we have just done, treat it as NOMATCH. */
6109 
6110  case MATCH_SKIP:
6111  if (md->start_match_ptr != start_match)
6112  {
6113  new_start_match = md->start_match_ptr;
6114  break;
6115  }
6116  /* Fall through */
6117 
6118  /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6119  the SKIP's arg was not found. We also treat this as NOMATCH. */
6120 
6121  case MATCH_SKIP_ARG:
6122  /* Fall through */
6123 
6124  /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6125  exactly like PRUNE. */
6126 
6127  case MATCH_NOMATCH:
6128  case MATCH_PRUNE:
6129  case MATCH_THEN:
6130  new_start_match = start_match + 1;
6131 #ifdef SUPPORT_UTF8
6132  if (utf8)
6133  while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6134  new_start_match++;
6135 #endif
6136  break;
6137 
6138  /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6139 
6140  case MATCH_COMMIT:
6141  rc = MATCH_NOMATCH;
6142  goto ENDLOOP;
6143 
6144  /* Any other return is either a match, or some kind of error. */
6145 
6146  default:
6147  goto ENDLOOP;
6148  }
6149 
6150  /* Control reaches here for the various types of "no match at this point"
6151  result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6152 
6153  rc = MATCH_NOMATCH;
6154 
6155  /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6156  newline in the subject (though it may continue over the newline). Therefore,
6157  if we have just failed to match, starting at a newline, do not continue. */
6158 
6159  if (firstline && IS_NEWLINE(start_match)) break;
6160 
6161  /* Advance to new matching position */
6162 
6163  start_match = new_start_match;
6164 
6165  /* Break the loop if the pattern is anchored or if we have passed the end of
6166  the subject. */
6167 
6168  if (anchored || start_match > end_subject) break;
6169 
6170  /* If we have just passed a CR and we are now at a LF, and the pattern does
6171  not contain any explicit matches for \r or \n, and the newline option is CRLF
6172  or ANY or ANYCRLF, advance the match position by one more character. */
6173 
6174  if (start_match[-1] == CHAR_CR &&
6175  start_match < end_subject &&
6176  *start_match == CHAR_NL &&
6177  (re->flags & PCRE_HASCRORLF) == 0 &&
6178  (md->nltype == NLTYPE_ANY ||
6179  md->nltype == NLTYPE_ANYCRLF ||
6180  md->nllen == 2))
6181  start_match++;
6182 
6183  md->mark = NULL; /* Reset for start of next match attempt */
6184  } /* End of for(;;) "bumpalong" loop */
6185 
6186 /* ==========================================================================*/
6187 
6188 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6189 conditions is true:
6190 
6191 (1) The pattern is anchored or the match was failed by (*COMMIT);
6192 
6193 (2) We are past the end of the subject;
6194 
6195 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6196  this option requests that a match occur at or before the first newline in
6197  the subject.
6198 
6199 When we have a match and the offset vector is big enough to deal with any
6200 backreferences, captured substring offsets will already be set up. In the case
6201 where we had to get some local store to hold offsets for backreference
6202 processing, copy those that we can. In this case there need not be overflow if
6203 certain parts of the pattern were not used, even though there are more
6204 capturing parentheses than vector slots. */
6205 
6206 ENDLOOP:
6207 
6208 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6209  {
6210  if (using_temporary_offsets)
6211  {
6212  if (offsetcount >= 4)
6213  {
6214  memcpy(offsets + 2, md->offset_vector + 2,
6215  (offsetcount - 2) * sizeof(int));
6216  DPRINTF(("Copied offsets from temporary memory\n"));
6217  }
6218  if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6219  DPRINTF(("Freeing temporary memory\n"));
6220  (pcre_free)(md->offset_vector);
6221  }
6222 
6223  /* Set the return code to the number of captured strings, or 0 if there are
6224  too many to fit into the vector. */
6225 
6226  rc = md->offset_overflow? 0 : md->end_offset_top/2;
6227 
6228  /* If there is space, set up the whole thing as substring 0. The value of
6229  md->start_match_ptr might be modified if \K was encountered on the successful
6230  matching path. */
6231 
6232  if (offsetcount < 2) rc = 0; else
6233  {
6234  offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6235  offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6236  }
6237 
6238  DPRINTF((">>>> returning %d\n", rc));
6239  goto RETURN_MARK;
6240  }
6241 
6242 /* Control gets here if there has been an error, or if the overall match
6243 attempt has failed at all permitted starting positions. */
6244 
6245 if (using_temporary_offsets)
6246  {
6247  DPRINTF(("Freeing temporary memory\n"));
6248  (pcre_free)(md->offset_vector);
6249  }
6250 
6251 /* For anything other than nomatch or partial match, just return the code. */
6252 
6253 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6254  {
6255  DPRINTF((">>>> error: returning %d\n", rc));
6256  return rc;
6257  }
6258 
6259 /* Handle partial matches - disable any mark data */
6260 
6261 if (start_partial != NULL)
6262  {
6263  DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6264  md->mark = NULL;
6265  if (offsetcount > 1)
6266  {
6267  offsets[0] = (int)(start_partial - (USPTR)subject);
6268  offsets[1] = (int)(end_subject - (USPTR)subject);
6269  }
6270  rc = PCRE_ERROR_PARTIAL;
6271  }
6272 
6273 /* This is the classic nomatch case */
6274 
6275 else
6276  {
6277  DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6278  rc = PCRE_ERROR_NOMATCH;
6279  }
6280 
6281 /* Return the MARK data if it has been requested. */
6282 
6283 RETURN_MARK:
6284 
6285 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6286  *(extra_data->mark) = (unsigned char *)(md->mark);
6287 return rc;
6288 }
6289 
6290 /* End of pcre_exec.c */