Process Hacker
pcre_study.c
Go to the documentation of this file.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2010 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #define HAVE_CONFIG_H
46 #ifdef HAVE_CONFIG_H
47 #include "config.h"
48 #endif
49 
50 #include "pcre_internal.h"
51 
52 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
53 
54 /* Returns from set_start_bits() */
55 
57 
58 
59 
60 /*************************************************
61 * Find the minimum subject length for a group *
62 *************************************************/
63 
64 /* Scan a parenthesized group and compute the minimum length of subject that
65 is needed to match it. This is a lower bound; it does not mean there is a
66 string of that length that matches. In UTF8 mode, the result is in characters
67 rather than bytes.
68 
69 Arguments:
70  code pointer to start of group (the bracket)
71  startcode pointer to start of the whole pattern
72  options the compiling options
73 
74 Returns: the minimum length
75  -1 if \C was encountered
76  -2 internal error (missing capturing bracket)
77 */
78 
79 static int
80 find_minlength(const uschar *code, const uschar *startcode, int options)
81 {
82 int length = -1;
83 BOOL utf8 = (options & PCRE_UTF8) != 0;
84 BOOL had_recurse = FALSE;
85 register int branchlength = 0;
86 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87 
88 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
89 
90 /* Scan along the opcodes for this branch. If we get to the end of the
91 branch, check the length against that of the other branches. */
92 
93 for (;;)
94  {
95  int d, min;
96  uschar *cs, *ce;
97  register int op = *cc;
98 
99  switch (op)
100  {
101  case OP_COND:
102  case OP_SCOND:
103 
104  /* If there is only one branch in a condition, the implied branch has zero
105  length, so we don't add anything. This covers the DEFINE "condition"
106  automatically. */
107 
108  cs = cc + GET(cc, 1);
109  if (*cs != OP_ALT)
110  {
111  cc = cs + 1 + LINK_SIZE;
112  break;
113  }
114 
115  /* Otherwise we can fall through and treat it the same as any other
116  subpattern. */
117 
118  case OP_CBRA:
119  case OP_SCBRA:
120  case OP_BRA:
121  case OP_SBRA:
122  case OP_ONCE:
123  d = find_minlength(cc, startcode, options);
124  if (d < 0) return d;
125  branchlength += d;
126  do cc += GET(cc, 1); while (*cc == OP_ALT);
127  cc += 1 + LINK_SIZE;
128  break;
129 
130  /* Reached end of a branch; if it's a ket it is the end of a nested
131  call. If it's ALT it is an alternation in a nested call. If it is
132  END it's the end of the outer call. All can be handled by the same code. */
133 
134  case OP_ALT:
135  case OP_KET:
136  case OP_KETRMAX:
137  case OP_KETRMIN:
138  case OP_END:
139  if (length < 0 || (!had_recurse && branchlength < length))
140  length = branchlength;
141  if (*cc != OP_ALT) return length;
142  cc += 1 + LINK_SIZE;
143  branchlength = 0;
144  had_recurse = FALSE;
145  break;
146 
147  /* Skip over assertive subpatterns */
148 
149  case OP_ASSERT:
150  case OP_ASSERT_NOT:
151  case OP_ASSERTBACK:
152  case OP_ASSERTBACK_NOT:
153  do cc += GET(cc, 1); while (*cc == OP_ALT);
154  /* Fall through */
155 
156  /* Skip over things that don't match chars */
157 
158  case OP_REVERSE:
159  case OP_CREF:
160  case OP_NCREF:
161  case OP_RREF:
162  case OP_NRREF:
163  case OP_DEF:
164  case OP_OPT:
165  case OP_CALLOUT:
166  case OP_SOD:
167  case OP_SOM:
168  case OP_EOD:
169  case OP_EODN:
170  case OP_CIRC:
171  case OP_DOLL:
173  case OP_WORD_BOUNDARY:
174  cc += _pcre_OP_lengths[*cc];
175  break;
176 
177  /* Skip over a subpattern that has a {0} or {0,x} quantifier */
178 
179  case OP_BRAZERO:
180  case OP_BRAMINZERO:
181  case OP_SKIPZERO:
182  cc += _pcre_OP_lengths[*cc];
183  do cc += GET(cc, 1); while (*cc == OP_ALT);
184  cc += 1 + LINK_SIZE;
185  break;
186 
187  /* Handle literal characters and + repetitions */
188 
189  case OP_CHAR:
190  case OP_CHARNC:
191  case OP_NOT:
192  case OP_PLUS:
193  case OP_MINPLUS:
194  case OP_POSPLUS:
195  case OP_NOTPLUS:
196  case OP_NOTMINPLUS:
197  case OP_NOTPOSPLUS:
198  branchlength++;
199  cc += 2;
200 #ifdef SUPPORT_UTF8
201  if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
202 #endif
203  break;
204 
205  case OP_TYPEPLUS:
206  case OP_TYPEMINPLUS:
207  case OP_TYPEPOSPLUS:
208  branchlength++;
209  cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
210  break;
211 
212  /* Handle exact repetitions. The count is already in characters, but we
213  need to skip over a multibyte character in UTF8 mode. */
214 
215  case OP_EXACT:
216  case OP_NOTEXACT:
217  branchlength += GET2(cc,1);
218  cc += 4;
219 #ifdef SUPPORT_UTF8
220  if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
221 #endif
222  break;
223 
224  case OP_TYPEEXACT:
225  branchlength += GET2(cc,1);
226  cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
227  break;
228 
229  /* Handle single-char non-literal matchers */
230 
231  case OP_PROP:
232  case OP_NOTPROP:
233  cc += 2;
234  /* Fall through */
235 
236  case OP_NOT_DIGIT:
237  case OP_DIGIT:
238  case OP_NOT_WHITESPACE:
239  case OP_WHITESPACE:
240  case OP_NOT_WORDCHAR:
241  case OP_WORDCHAR:
242  case OP_ANY:
243  case OP_ALLANY:
244  case OP_EXTUNI:
245  case OP_HSPACE:
246  case OP_NOT_HSPACE:
247  case OP_VSPACE:
248  case OP_NOT_VSPACE:
249  branchlength++;
250  cc++;
251  break;
252 
253  /* "Any newline" might match two characters */
254 
255  case OP_ANYNL:
256  branchlength += 2;
257  cc++;
258  break;
259 
260  /* The single-byte matcher means we can't proceed in UTF-8 mode */
261 
262  case OP_ANYBYTE:
263 #ifdef SUPPORT_UTF8
264  if (utf8) return -1;
265 #endif
266  branchlength++;
267  cc++;
268  break;
269 
270  /* For repeated character types, we have to test for \p and \P, which have
271  an extra two bytes of parameters. */
272 
273  case OP_TYPESTAR:
274  case OP_TYPEMINSTAR:
275  case OP_TYPEQUERY:
276  case OP_TYPEMINQUERY:
277  case OP_TYPEPOSSTAR:
278  case OP_TYPEPOSQUERY:
279  if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
280  cc += _pcre_OP_lengths[op];
281  break;
282 
283  case OP_TYPEUPTO:
284  case OP_TYPEMINUPTO:
285  case OP_TYPEPOSUPTO:
286  if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
287  cc += _pcre_OP_lengths[op];
288  break;
289 
290  /* Check a class for variable quantification */
291 
292 #ifdef SUPPORT_UTF8
293  case OP_XCLASS:
294  cc += GET(cc, 1) - 33;
295  /* Fall through */
296 #endif
297 
298  case OP_CLASS:
299  case OP_NCLASS:
300  cc += 33;
301 
302  switch (*cc)
303  {
304  case OP_CRPLUS:
305  case OP_CRMINPLUS:
306  branchlength++;
307  /* Fall through */
308 
309  case OP_CRSTAR:
310  case OP_CRMINSTAR:
311  case OP_CRQUERY:
312  case OP_CRMINQUERY:
313  cc++;
314  break;
315 
316  case OP_CRRANGE:
317  case OP_CRMINRANGE:
318  branchlength += GET2(cc,1);
319  cc += 5;
320  break;
321 
322  default:
323  branchlength++;
324  break;
325  }
326  break;
327 
328  /* Backreferences and subroutine calls are treated in the same way: we find
329  the minimum length for the subpattern. A recursion, however, causes an
330  a flag to be set that causes the length of this branch to be ignored. The
331  logic is that a recursion can only make sense if there is another
332  alternation that stops the recursing. That will provide the minimum length
333  (when no recursion happens). A backreference within the group that it is
334  referencing behaves in the same way.
335 
336  If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
337  matches an empty string (by default it causes a matching failure), so in
338  that case we must set the minimum length to zero. */
339 
340  case OP_REF:
341  if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
342  {
343  ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
344  if (cs == NULL) return -2;
345  do ce += GET(ce, 1); while (*ce == OP_ALT);
346  if (cc > cs && cc < ce)
347  {
348  d = 0;
349  had_recurse = TRUE;
350  }
351  else d = find_minlength(cs, startcode, options);
352  }
353  else d = 0;
354  cc += 3;
355 
356  /* Handle repeated back references */
357 
358  switch (*cc)
359  {
360  case OP_CRSTAR:
361  case OP_CRMINSTAR:
362  case OP_CRQUERY:
363  case OP_CRMINQUERY:
364  min = 0;
365  cc++;
366  break;
367 
368  case OP_CRRANGE:
369  case OP_CRMINRANGE:
370  min = GET2(cc, 1);
371  cc += 5;
372  break;
373 
374  default:
375  min = 1;
376  break;
377  }
378 
379  branchlength += min * d;
380  break;
381 
382  case OP_RECURSE:
383  cs = ce = (uschar *)startcode + GET(cc, 1);
384  if (cs == NULL) return -2;
385  do ce += GET(ce, 1); while (*ce == OP_ALT);
386  if (cc > cs && cc < ce)
387  had_recurse = TRUE;
388  else
389  branchlength += find_minlength(cs, startcode, options);
390  cc += 1 + LINK_SIZE;
391  break;
392 
393  /* Anything else does not or need not match a character. We can get the
394  item's length from the table, but for those that can match zero occurrences
395  of a character, we must take special action for UTF-8 characters. */
396 
397  case OP_UPTO:
398  case OP_NOTUPTO:
399  case OP_MINUPTO:
400  case OP_NOTMINUPTO:
401  case OP_POSUPTO:
402  case OP_STAR:
403  case OP_MINSTAR:
404  case OP_NOTMINSTAR:
405  case OP_POSSTAR:
406  case OP_NOTPOSSTAR:
407  case OP_QUERY:
408  case OP_MINQUERY:
409  case OP_NOTMINQUERY:
410  case OP_POSQUERY:
411  case OP_NOTPOSQUERY:
412  cc += _pcre_OP_lengths[op];
413 #ifdef SUPPORT_UTF8
414  if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
415 #endif
416  break;
417 
418  /* Skip these, but we need to add in the name length. */
419 
420  case OP_MARK:
421  case OP_PRUNE_ARG:
422  case OP_SKIP_ARG:
423  cc += _pcre_OP_lengths[op] + cc[1];
424  break;
425 
426  case OP_THEN_ARG:
427  cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
428  break;
429 
430  /* For the record, these are the opcodes that are matched by "default":
431  OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
432  OP_THEN. */
433 
434  default:
435  cc += _pcre_OP_lengths[op];
436  break;
437  }
438  }
439 /* Control never gets here */
440 }
441 
442 
443 
444 /*************************************************
445 * Set a bit and maybe its alternate case *
446 *************************************************/
447 
448 /* Given a character, set its first byte's bit in the table, and also the
449 corresponding bit for the other version of a letter if we are caseless. In
450 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
451 when Unicode property support is available.
452 
453 Arguments:
454  start_bits points to the bit map
455  p points to the character
456  caseless the caseless flag
457  cd the block with char table pointers
458  utf8 TRUE for UTF-8 mode
459 
460 Returns: pointer after the character
461 */
462 
463 static const uschar *
464 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
465  compile_data *cd, BOOL utf8)
466 {
467 unsigned int c = *p;
468 
469 SET_BIT(c);
470 
471 #ifdef SUPPORT_UTF8
472 if (utf8 && c > 127)
473  {
474  GETCHARINC(c, p);
475 #ifdef SUPPORT_UCP
476  if (caseless)
477  {
478  uschar buff[8];
479  c = UCD_OTHERCASE(c);
480  (void)_pcre_ord2utf8(c, buff);
481  SET_BIT(buff[0]);
482  }
483 #endif
484  return p;
485  }
486 #endif
487 
488 /* Not UTF-8 mode, or character is less than 127. */
489 
490 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
491 return p + 1;
492 }
493 
494 
495 
496 /*************************************************
497 * Set bits for a positive character type *
498 *************************************************/
499 
500 /* This function sets starting bits for a character type. In UTF-8 mode, we can
501 only do a direct setting for bytes less than 128, as otherwise there can be
502 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
503 environment, the tables will only recognize ASCII characters anyway, but in at
504 least one Windows environment, some higher bytes bits were set in the tables.
505 So we deal with that case by considering the UTF-8 encoding.
506 
507 Arguments:
508  start_bits the starting bitmap
509  cbit type the type of character wanted
510  table_limit 32 for non-UTF-8; 16 for UTF-8
511  cd the block with char table pointers
512 
513 Returns: nothing
514 */
515 
516 static void
517 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
518  compile_data *cd)
519 {
520 register int c;
521 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
522 if (table_limit == 32) return;
523 for (c = 128; c < 256; c++)
524  {
525  if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
526  {
527  uschar buff[8];
528  (void)_pcre_ord2utf8(c, buff);
529  SET_BIT(buff[0]);
530  }
531  }
532 }
533 
534 
535 /*************************************************
536 * Set bits for a negative character type *
537 *************************************************/
538 
539 /* This function sets starting bits for a negative character type such as \D.
540 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
541 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
542 Unlike in the positive case, where we can set appropriate starting bits for
543 specific high-valued UTF-8 characters, in this case we have to set the bits for
544 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
545 0xc0 (192) for simplicity.
546 
547 Arguments:
548  start_bits the starting bitmap
549  cbit type the type of character wanted
550  table_limit 32 for non-UTF-8; 16 for UTF-8
551  cd the block with char table pointers
552 
553 Returns: nothing
554 */
555 
556 static void
557 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
558  compile_data *cd)
559 {
560 register int c;
561 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
562 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
563 }
564 
565 
566 
567 /*************************************************
568 * Create bitmap of starting bytes *
569 *************************************************/
570 
571 /* This function scans a compiled unanchored expression recursively and
572 attempts to build a bitmap of the set of possible starting bytes. As time goes
573 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
574 useful for parenthesized groups in patterns such as (a*)b where the group
575 provides some optional starting bytes but scanning must continue at the outer
576 level to find at least one mandatory byte. At the outermost level, this
577 function fails unless the result is SSB_DONE.
578 
579 Arguments:
580  code points to an expression
581  start_bits points to a 32-byte table, initialized to 0
582  caseless the current state of the caseless flag
583  utf8 TRUE if in UTF-8 mode
584  cd the block with char table pointers
585 
586 Returns: SSB_FAIL => Failed to find any starting bytes
587  SSB_DONE => Found mandatory starting bytes
588  SSB_CONTINUE => Found optional starting bytes
589 */
590 
591 static int
592 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
593  BOOL utf8, compile_data *cd)
594 {
595 register int c;
596 int yield = SSB_DONE;
597 int table_limit = utf8? 16:32;
598 
599 #if 0
600 /* ========================================================================= */
601 /* The following comment and code was inserted in January 1999. In May 2006,
602 when it was observed to cause compiler warnings about unused values, I took it
603 out again. If anybody is still using OS/2, they will have to put it back
604 manually. */
605 
606 /* This next statement and the later reference to dummy are here in order to
607 trick the optimizer of the IBM C compiler for OS/2 into generating correct
608 code. Apparently IBM isn't going to fix the problem, and we would rather not
609 disable optimization (in this module it actually makes a big difference, and
610 the pcre module can use all the optimization it can get). */
611 
612 volatile int dummy;
613 /* ========================================================================= */
614 #endif
615 
616 do
617  {
618  const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
619  BOOL try_next = TRUE;
620 
621  while (try_next) /* Loop for items in this branch */
622  {
623  int rc;
624  switch(*tcode)
625  {
626  /* Fail if we reach something we don't understand */
627 
628  default:
629  return SSB_FAIL;
630 
631  /* If we hit a bracket or a positive lookahead assertion, recurse to set
632  bits from within the subpattern. If it can't find anything, we have to
633  give up. If it finds some mandatory character(s), we are done for this
634  branch. Otherwise, carry on scanning after the subpattern. */
635 
636  case OP_BRA:
637  case OP_SBRA:
638  case OP_CBRA:
639  case OP_SCBRA:
640  case OP_ONCE:
641  case OP_ASSERT:
642  rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
643  if (rc == SSB_FAIL) return SSB_FAIL;
644  if (rc == SSB_DONE) try_next = FALSE; else
645  {
646  do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
647  tcode += 1 + LINK_SIZE;
648  }
649  break;
650 
651  /* If we hit ALT or KET, it means we haven't found anything mandatory in
652  this branch, though we might have found something optional. For ALT, we
653  continue with the next alternative, but we have to arrange that the final
654  result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
655  return SSB_CONTINUE: if this is the top level, that indicates failure,
656  but after a nested subpattern, it causes scanning to continue. */
657 
658  case OP_ALT:
659  yield = SSB_CONTINUE;
660  try_next = FALSE;
661  break;
662 
663  case OP_KET:
664  case OP_KETRMAX:
665  case OP_KETRMIN:
666  return SSB_CONTINUE;
667 
668  /* Skip over callout */
669 
670  case OP_CALLOUT:
671  tcode += 2 + 2*LINK_SIZE;
672  break;
673 
674  /* Skip over lookbehind and negative lookahead assertions */
675 
676  case OP_ASSERT_NOT:
677  case OP_ASSERTBACK:
678  case OP_ASSERTBACK_NOT:
679  do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
680  tcode += 1 + LINK_SIZE;
681  break;
682 
683  /* Skip over an option setting, changing the caseless flag */
684 
685  case OP_OPT:
686  caseless = (tcode[1] & PCRE_CASELESS) != 0;
687  tcode += 2;
688  break;
689 
690  /* BRAZERO does the bracket, but carries on. */
691 
692  case OP_BRAZERO:
693  case OP_BRAMINZERO:
694  if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
695  return SSB_FAIL;
696 /* =========================================================================
697  See the comment at the head of this function concerning the next line,
698  which was an old fudge for the benefit of OS/2.
699  dummy = 1;
700  ========================================================================= */
701  do tcode += GET(tcode,1); while (*tcode == OP_ALT);
702  tcode += 1 + LINK_SIZE;
703  break;
704 
705  /* SKIPZERO skips the bracket. */
706 
707  case OP_SKIPZERO:
708  tcode++;
709  do tcode += GET(tcode,1); while (*tcode == OP_ALT);
710  tcode += 1 + LINK_SIZE;
711  break;
712 
713  /* Single-char * or ? sets the bit and tries the next item */
714 
715  case OP_STAR:
716  case OP_MINSTAR:
717  case OP_POSSTAR:
718  case OP_QUERY:
719  case OP_MINQUERY:
720  case OP_POSQUERY:
721  tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
722  break;
723 
724  /* Single-char upto sets the bit and tries the next */
725 
726  case OP_UPTO:
727  case OP_MINUPTO:
728  case OP_POSUPTO:
729  tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
730  break;
731 
732  /* At least one single char sets the bit and stops */
733 
734  case OP_EXACT: /* Fall through */
735  tcode += 2;
736 
737  case OP_CHAR:
738  case OP_CHARNC:
739  case OP_PLUS:
740  case OP_MINPLUS:
741  case OP_POSPLUS:
742  (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
743  try_next = FALSE;
744  break;
745 
746  /* Special spacing and line-terminating items. These recognize specific
747  lists of characters. The difference between VSPACE and ANYNL is that the
748  latter can match the two-character CRLF sequence, but that is not
749  relevant for finding the first character, so their code here is
750  identical. */
751 
752  case OP_HSPACE:
753  SET_BIT(0x09);
754  SET_BIT(0x20);
755  if (utf8)
756  {
757  SET_BIT(0xC2); /* For U+00A0 */
758  SET_BIT(0xE1); /* For U+1680, U+180E */
759  SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
760  SET_BIT(0xE3); /* For U+3000 */
761  }
762  else SET_BIT(0xA0);
763  try_next = FALSE;
764  break;
765 
766  case OP_ANYNL:
767  case OP_VSPACE:
768  SET_BIT(0x0A);
769  SET_BIT(0x0B);
770  SET_BIT(0x0C);
771  SET_BIT(0x0D);
772  if (utf8)
773  {
774  SET_BIT(0xC2); /* For U+0085 */
775  SET_BIT(0xE2); /* For U+2028, U+2029 */
776  }
777  else SET_BIT(0x85);
778  try_next = FALSE;
779  break;
780 
781  /* Single character types set the bits and stop. Note that if PCRE_UCP
782  is set, we do not see these op codes because \d etc are converted to
783  properties. Therefore, these apply in the case when only characters less
784  than 256 are recognized to match the types. */
785 
786  case OP_NOT_DIGIT:
787  set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
788  try_next = FALSE;
789  break;
790 
791  case OP_DIGIT:
792  set_type_bits(start_bits, cbit_digit, table_limit, cd);
793  try_next = FALSE;
794  break;
795 
796  /* The cbit_space table has vertical tab as whitespace; we have to
797  ensure it is set as not whitespace. */
798 
799  case OP_NOT_WHITESPACE:
800  set_nottype_bits(start_bits, cbit_space, table_limit, cd);
801  start_bits[1] |= 0x08;
802  try_next = FALSE;
803  break;
804 
805  /* The cbit_space table has vertical tab as whitespace; we have to
806  not set it from the table. */
807 
808  case OP_WHITESPACE:
809  c = start_bits[1]; /* Save in case it was already set */
810  set_type_bits(start_bits, cbit_space, table_limit, cd);
811  start_bits[1] = (start_bits[1] & ~0x08) | c;
812  try_next = FALSE;
813  break;
814 
815  case OP_NOT_WORDCHAR:
816  set_nottype_bits(start_bits, cbit_word, table_limit, cd);
817  try_next = FALSE;
818  break;
819 
820  case OP_WORDCHAR:
821  set_type_bits(start_bits, cbit_word, table_limit, cd);
822  try_next = FALSE;
823  break;
824 
825  /* One or more character type fudges the pointer and restarts, knowing
826  it will hit a single character type and stop there. */
827 
828  case OP_TYPEPLUS:
829  case OP_TYPEMINPLUS:
830  case OP_TYPEPOSPLUS:
831  tcode++;
832  break;
833 
834  case OP_TYPEEXACT:
835  tcode += 3;
836  break;
837 
838  /* Zero or more repeats of character types set the bits and then
839  try again. */
840 
841  case OP_TYPEUPTO:
842  case OP_TYPEMINUPTO:
843  case OP_TYPEPOSUPTO:
844  tcode += 2; /* Fall through */
845 
846  case OP_TYPESTAR:
847  case OP_TYPEMINSTAR:
848  case OP_TYPEPOSSTAR:
849  case OP_TYPEQUERY:
850  case OP_TYPEMINQUERY:
851  case OP_TYPEPOSQUERY:
852  switch(tcode[1])
853  {
854  default:
855  case OP_ANY:
856  case OP_ALLANY:
857  return SSB_FAIL;
858 
859  case OP_HSPACE:
860  SET_BIT(0x09);
861  SET_BIT(0x20);
862  if (utf8)
863  {
864  SET_BIT(0xC2); /* For U+00A0 */
865  SET_BIT(0xE1); /* For U+1680, U+180E */
866  SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
867  SET_BIT(0xE3); /* For U+3000 */
868  }
869  else SET_BIT(0xA0);
870  break;
871 
872  case OP_ANYNL:
873  case OP_VSPACE:
874  SET_BIT(0x0A);
875  SET_BIT(0x0B);
876  SET_BIT(0x0C);
877  SET_BIT(0x0D);
878  if (utf8)
879  {
880  SET_BIT(0xC2); /* For U+0085 */
881  SET_BIT(0xE2); /* For U+2028, U+2029 */
882  }
883  else SET_BIT(0x85);
884  break;
885 
886  case OP_NOT_DIGIT:
887  set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
888  break;
889 
890  case OP_DIGIT:
891  set_type_bits(start_bits, cbit_digit, table_limit, cd);
892  break;
893 
894  /* The cbit_space table has vertical tab as whitespace; we have to
895  ensure it gets set as not whitespace. */
896 
897  case OP_NOT_WHITESPACE:
898  set_nottype_bits(start_bits, cbit_space, table_limit, cd);
899  start_bits[1] |= 0x08;
900  break;
901 
902  /* The cbit_space table has vertical tab as whitespace; we have to
903  avoid setting it. */
904 
905  case OP_WHITESPACE:
906  c = start_bits[1]; /* Save in case it was already set */
907  set_type_bits(start_bits, cbit_space, table_limit, cd);
908  start_bits[1] = (start_bits[1] & ~0x08) | c;
909  break;
910 
911  case OP_NOT_WORDCHAR:
912  set_nottype_bits(start_bits, cbit_word, table_limit, cd);
913  break;
914 
915  case OP_WORDCHAR:
916  set_type_bits(start_bits, cbit_word, table_limit, cd);
917  break;
918  }
919 
920  tcode += 2;
921  break;
922 
923  /* Character class where all the information is in a bit map: set the
924  bits and either carry on or not, according to the repeat count. If it was
925  a negative class, and we are operating with UTF-8 characters, any byte
926  with a value >= 0xc4 is a potentially valid starter because it starts a
927  character with a value > 255. */
928 
929  case OP_NCLASS:
930 #ifdef SUPPORT_UTF8
931  if (utf8)
932  {
933  start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
934  memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
935  }
936 #endif
937  /* Fall through */
938 
939  case OP_CLASS:
940  {
941  tcode++;
942 
943  /* In UTF-8 mode, the bits in a bit map correspond to character
944  values, not to byte values. However, the bit map we are constructing is
945  for byte values. So we have to do a conversion for characters whose
946  value is > 127. In fact, there are only two possible starting bytes for
947  characters in the range 128 - 255. */
948 
949 #ifdef SUPPORT_UTF8
950  if (utf8)
951  {
952  for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
953  for (c = 128; c < 256; c++)
954  {
955  if ((tcode[c/8] && (1 << (c&7))) != 0)
956  {
957  int d = (c >> 6) | 0xc0; /* Set bit for this starter */
958  start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
959  c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
960  }
961  }
962  }
963 
964  /* In non-UTF-8 mode, the two bit maps are completely compatible. */
965 
966  else
967 #endif
968  {
969  for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
970  }
971 
972  /* Advance past the bit map, and act on what follows */
973 
974  tcode += 32;
975  switch (*tcode)
976  {
977  case OP_CRSTAR:
978  case OP_CRMINSTAR:
979  case OP_CRQUERY:
980  case OP_CRMINQUERY:
981  tcode++;
982  break;
983 
984  case OP_CRRANGE:
985  case OP_CRMINRANGE:
986  if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
987  else try_next = FALSE;
988  break;
989 
990  default:
991  try_next = FALSE;
992  break;
993  }
994  }
995  break; /* End of bitmap class handling */
996 
997  } /* End of switch */
998  } /* End of try_next loop */
999 
1000  code += GET(code, 1); /* Advance to next branch */
1001  }
1002 while (*code == OP_ALT);
1003 return yield;
1004 }
1005 
1006 
1007 
1008 /*************************************************
1009 * Study a compiled expression *
1010 *************************************************/
1011 
1012 /* This function is handed a compiled expression that it must study to produce
1013 information that will speed up the matching. It returns a pcre_extra block
1014 which then gets handed back to pcre_exec().
1015 
1016 Arguments:
1017  re points to the compiled expression
1018  options contains option bits
1019  errorptr points to where to place error messages;
1020  set NULL unless error
1021 
1022 Returns: pointer to a pcre_extra block, with study_data filled in and the
1023  appropriate flags set;
1024  NULL on error or if no optimization possible
1025 */
1026 
1028 pcre_study(const pcre *external_re, int options, const char **errorptr)
1029 {
1030 int min;
1031 BOOL bits_set = FALSE;
1032 uschar start_bits[32];
1033 pcre_extra *extra;
1034 pcre_study_data *study;
1035 const uschar *tables;
1036 uschar *code;
1037 compile_data compile_block;
1038 const real_pcre *re = (const real_pcre *)external_re;
1039 
1040 *errorptr = NULL;
1041 
1042 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1043  {
1044  *errorptr = "argument is not a compiled regular expression";
1045  return NULL;
1046  }
1047 
1048 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1049  {
1050  *errorptr = "unknown or incorrect option bit(s) set";
1051  return NULL;
1052  }
1053 
1054 code = (uschar *)re + re->name_table_offset +
1055  (re->name_count * re->name_entry_size);
1056 
1057 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1058 a multiline pattern that matches only at "line starts", there is no point in
1059 seeking a list of starting bytes. */
1060 
1061 if ((re->options & PCRE_ANCHORED) == 0 &&
1062  (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1063  {
1064  /* Set the character tables in the block that is passed around */
1065 
1066  tables = re->tables;
1067  if (tables == NULL)
1068  (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1069  (void *)(&tables));
1070 
1071  compile_block.lcc = tables + lcc_offset;
1072  compile_block.fcc = tables + fcc_offset;
1073  compile_block.cbits = tables + cbits_offset;
1074  compile_block.ctypes = tables + ctypes_offset;
1075 
1076  /* See if we can find a fixed set of initial characters for the pattern. */
1077 
1078  memset(start_bits, 0, 32 * sizeof(uschar));
1079  bits_set = set_start_bits(code, start_bits,
1080  (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1081  &compile_block) == SSB_DONE;
1082  }
1083 
1084 /* Find the minimum length of subject string. */
1085 
1086 min = find_minlength(code, code, re->options);
1087 
1088 /* Return NULL if no optimization is possible. */
1089 
1090 if (!bits_set && min < 0) return NULL;
1091 
1092 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1093 the latter, which is pointed to by the former, which may also get additional
1094 data set later by the calling program. At the moment, the size of
1095 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1096 the pcre_fullinfo() function so that if it becomes variable in the future, we
1097 don't have to change that code. */
1098 
1099 extra = (pcre_extra *)(pcre_malloc)
1100  (sizeof(pcre_extra) + sizeof(pcre_study_data));
1101 
1102 if (extra == NULL)
1103  {
1104  *errorptr = "failed to get memory";
1105  return NULL;
1106  }
1107 
1108 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1109 extra->flags = PCRE_EXTRA_STUDY_DATA;
1110 extra->study_data = study;
1111 
1112 study->size = sizeof(pcre_study_data);
1113 study->flags = 0;
1114 
1115 if (bits_set)
1116  {
1117  study->flags |= PCRE_STUDY_MAPPED;
1118  memcpy(study->start_bits, start_bits, sizeof(start_bits));
1119  }
1120 
1121 if (min >= 0)
1122  {
1123  study->flags |= PCRE_STUDY_MINLEN;
1124  study->minlength = min;
1125  }
1126 
1127 return extra;
1128 }
1129 
1130 /* End of pcre_study.c */