/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 164 by ph10, Fri May 4 15:11:14 2007 UTC revision 190 by ph10, Thu Jul 19 10:38:20 2007 UTC
# Line 189  obtained from malloc() instead o Line 189  obtained from malloc() instead o
189  achieve this so that the actual code doesn't look very different to what it  achieve this so that the actual code doesn't look very different to what it
190  always used to.  always used to.
191    
192  The original heap-recursive code used longjmp(). However, it seems that this  The original heap-recursive code used longjmp(). However, it seems that this
193  can be very slow on some operating systems. Following a suggestion from Stan  can be very slow on some operating systems. Following a suggestion from Stan
194  Switzer, the use of longjmp() has been abolished, at the cost of having to  Switzer, the use of longjmp() has been abolished, at the cost of having to
195  provide a unique number for each call to RMATCH. There is no way of generating  provide a unique number for each call to RMATCH. There is no way of generating
# Line 198  them stand out more clearly. Line 198  them stand out more clearly.
198    
199  Crude tests on x86 Linux show a small speedup of around 5-8%. However, on  Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200  FreeBSD, avoiding longjmp() more than halves the time taken to run the standard  FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201  tests. Furthermore, not using longjmp() means that local dynamic variables  tests. Furthermore, not using longjmp() means that local dynamic variables
202  don't have indeterminate values; this has meant that the frame size can be  don't have indeterminate values; this has meant that the frame size can be
203  reduced because the result can be "passed back" by straight setting of the  reduced because the result can be "passed back" by straight setting of the
204  variable instead of being passed in the frame.  variable instead of being passed in the frame.
205  ****************************************************************************  ****************************************************************************
# Line 213  enum { RM1=1, RM2, RM3, RM4, RM5, RM Line 213  enum { RM1=1, RM2, RM3, RM4, RM5, RM
213         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,         RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
214         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,         RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
215         RM41,  RM42, RM43, RM44, RM45, RM46, RM47 };         RM41,  RM42, RM43, RM44, RM45, RM46, RM47 };
216    
217    
218  /* These versions of the macros use the stack, as normal. There are debugging  /* These versions of the macros use the stack, as normal. There are debugging
219  versions and production versions. Note that the "rw" argument of RMATCH isn't  versions and production versions. Note that the "rw" argument of RMATCH isn't
220  actually used in this definition. */  actually used in this definition. */
221    
222  #ifndef NO_RECURSE  #ifndef NO_RECURSE
# Line 226  actually used in this definition. */ Line 226  actually used in this definition. */
226  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
227    { \    { \
228    printf("match() called in line %d\n", __LINE__); \    printf("match() called in line %d\n", __LINE__); \
229    rrc = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \    rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
230    printf("to line %d\n", __LINE__); \    printf("to line %d\n", __LINE__); \
231    }    }
232  #define RRETURN(ra) \  #define RRETURN(ra) \
# Line 236  actually used in this definition. */ Line 236  actually used in this definition. */
236    }    }
237  #else  #else
238  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
239    rrc = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)    rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
240  #define RRETURN(ra) return ra  #define RRETURN(ra) return ra
241  #endif  #endif
242    
# Line 255  argument of match(), which never changes Line 255  argument of match(), which never changes
255    frame->Xwhere = rw; \    frame->Xwhere = rw; \
256    newframe->Xeptr = ra;\    newframe->Xeptr = ra;\
257    newframe->Xecode = rb;\    newframe->Xecode = rb;\
258      newframe->Xmstart = mstart;\
259    newframe->Xoffset_top = rc;\    newframe->Xoffset_top = rc;\
260    newframe->Xims = re;\    newframe->Xims = re;\
261    newframe->Xeptrb = rf;\    newframe->Xeptrb = rf;\
# Line 291  typedef struct heapframe { Line 292  typedef struct heapframe {
292    
293    const uschar *Xeptr;    const uschar *Xeptr;
294    const uschar *Xecode;    const uschar *Xecode;
295      const uschar *Xmstart;
296    int Xoffset_top;    int Xoffset_top;
297    long int Xims;    long int Xims;
298    eptrblock *Xeptrb;    eptrblock *Xeptrb;
# Line 344  typedef struct heapframe { Line 346  typedef struct heapframe {
346    /* Where to jump back to */    /* Where to jump back to */
347    
348    int Xwhere;    int Xwhere;
349    
350  } heapframe;  } heapframe;
351    
352  #endif  #endif
# Line 371  made performance worse. Line 373  made performance worse.
373  Arguments:  Arguments:
374     eptr        pointer to current character in subject     eptr        pointer to current character in subject
375     ecode       pointer to current position in compiled code     ecode       pointer to current position in compiled code
376       mstart      pointer to the current match start position (can be modified
377                     by encountering \K)
378     offset_top  current top pointer     offset_top  current top pointer
379     md          pointer to "static" info for the match     md          pointer to "static" info for the match
380     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
# Line 390  Returns: MATCH_MATCH if matched Line 394  Returns: MATCH_MATCH if matched
394  */  */
395    
396  static int  static int
397  match(REGISTER USPTR eptr, REGISTER const uschar *ecode,  match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
398    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,    int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
399    int flags, unsigned int rdepth)    int flags, unsigned int rdepth)
400  {  {
# Line 418  frame->Xprevframe = NULL; /* Line 422  frame->Xprevframe = NULL; /*
422    
423  frame->Xeptr = eptr;  frame->Xeptr = eptr;
424  frame->Xecode = ecode;  frame->Xecode = ecode;
425    frame->Xmstart = mstart;
426  frame->Xoffset_top = offset_top;  frame->Xoffset_top = offset_top;
427  frame->Xims = ims;  frame->Xims = ims;
428  frame->Xeptrb = eptrb;  frame->Xeptrb = eptrb;
# Line 432  HEAP_RECURSE: Line 437  HEAP_RECURSE:
437    
438  #define eptr               frame->Xeptr  #define eptr               frame->Xeptr
439  #define ecode              frame->Xecode  #define ecode              frame->Xecode
440    #define mstart             frame->Xmstart
441  #define offset_top         frame->Xoffset_top  #define offset_top         frame->Xoffset_top
442  #define ims                frame->Xims  #define ims                frame->Xims
443  #define eptrb              frame->Xeptrb  #define eptrb              frame->Xeptrb
# Line 610  for (;;) Line 616  for (;;)
616    
617    if (md->partial &&    if (md->partial &&
618        eptr >= md->end_subject &&        eptr >= md->end_subject &&
619        eptr > md->start_match)        eptr > mstart)
620      md->hitend = TRUE;      md->hitend = TRUE;
621    
622    switch(op)    switch(op)
# Line 787  for (;;) Line 793  for (;;)
793        md->recursive = rec->prevrec;        md->recursive = rec->prevrec;
794        memmove(md->offset_vector, rec->offset_save,        memmove(md->offset_vector, rec->offset_save,
795          rec->saved_max * sizeof(int));          rec->saved_max * sizeof(int));
796        md->start_match = rec->save_start;        mstart = rec->save_start;
797        ims = original_ims;        ims = original_ims;
798        ecode = rec->after_call;        ecode = rec->after_call;
799        break;        break;
# Line 796  for (;;) Line 802  for (;;)
802      /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty      /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
803      string - backtracking will then try other alternatives, if any. */      string - backtracking will then try other alternatives, if any. */
804    
805      if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);      if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
806      md->end_match_ptr = eptr;          /* Record where we ended */      md->end_match_ptr = eptr;           /* Record where we ended */
807      md->end_offset_top = offset_top;   /* and how many extracts were taken */      md->end_offset_top = offset_top;    /* and how many extracts were taken */
808        md->start_match_ptr = mstart;  /* and the start (\K can modify) */
809      RRETURN(MATCH_MATCH);      RRETURN(MATCH_MATCH);
810    
811      /* Change option settings */      /* Change option settings */
# Line 904  for (;;) Line 911  for (;;)
911        cb.offset_vector    = md->offset_vector;        cb.offset_vector    = md->offset_vector;
912        cb.subject          = (PCRE_SPTR)md->start_subject;        cb.subject          = (PCRE_SPTR)md->start_subject;
913        cb.subject_length   = md->end_subject - md->start_subject;        cb.subject_length   = md->end_subject - md->start_subject;
914        cb.start_match      = md->start_match - md->start_subject;        cb.start_match      = mstart - md->start_subject;
915        cb.current_position = eptr - md->start_subject;        cb.current_position = eptr - md->start_subject;
916        cb.pattern_position = GET(ecode, 2);        cb.pattern_position = GET(ecode, 2);
917        cb.next_item_length = GET(ecode, 2 + LINK_SIZE);        cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
# Line 966  for (;;) Line 973  for (;;)
973    
974        memcpy(new_recursive.offset_save, md->offset_vector,        memcpy(new_recursive.offset_save, md->offset_vector,
975              new_recursive.saved_max * sizeof(int));              new_recursive.saved_max * sizeof(int));
976        new_recursive.save_start = md->start_match;        new_recursive.save_start = mstart;
977        md->start_match = eptr;        mstart = eptr;
978    
979        /* OK, now we can do the recursion. For each top-level alternative we        /* OK, now we can do the recursion. For each top-level alternative we
980        restore the offset and recursion data. */        restore the offset and recursion data. */
# Line 1180  for (;;) Line 1187  for (;;)
1187          recursion_info *rec = md->recursive;          recursion_info *rec = md->recursive;
1188          DPRINTF(("Recursion (%d) succeeded - continuing\n", number));          DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1189          md->recursive = rec->prevrec;          md->recursive = rec->prevrec;
1190          md->start_match = rec->save_start;          mstart = rec->save_start;
1191          memcpy(md->offset_vector, rec->offset_save,          memcpy(md->offset_vector, rec->offset_save,
1192            rec->saved_max * sizeof(int));            rec->saved_max * sizeof(int));
1193          ecode = rec->after_call;          ecode = rec->after_call;
# Line 1260  for (;;) Line 1267  for (;;)
1267      ecode++;      ecode++;
1268      break;      break;
1269    
1270        /* Reset the start of match point */
1271    
1272        case OP_SET_SOM:
1273        mstart = eptr;
1274        ecode++;
1275        break;
1276    
1277      /* Assert before internal newline if multiline, or before a terminating      /* Assert before internal newline if multiline, or before a terminating
1278      newline unless endonly is set, else end of subject unless noteol is set. */      newline unless endonly is set, else end of subject unless noteol is set. */
1279    
# Line 1468  for (;;) Line 1482  for (;;)
1482      ecode++;      ecode++;
1483      break;      break;
1484    
1485        case OP_NOT_HSPACE:
1486        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1487        GETCHARINCTEST(c, eptr);
1488        switch(c)
1489          {
1490          default: break;
1491          case 0x09:      /* HT */
1492          case 0x20:      /* SPACE */
1493          case 0xa0:      /* NBSP */
1494          case 0x1680:    /* OGHAM SPACE MARK */
1495          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1496          case 0x2000:    /* EN QUAD */
1497          case 0x2001:    /* EM QUAD */
1498          case 0x2002:    /* EN SPACE */
1499          case 0x2003:    /* EM SPACE */
1500          case 0x2004:    /* THREE-PER-EM SPACE */
1501          case 0x2005:    /* FOUR-PER-EM SPACE */
1502          case 0x2006:    /* SIX-PER-EM SPACE */
1503          case 0x2007:    /* FIGURE SPACE */
1504          case 0x2008:    /* PUNCTUATION SPACE */
1505          case 0x2009:    /* THIN SPACE */
1506          case 0x200A:    /* HAIR SPACE */
1507          case 0x202f:    /* NARROW NO-BREAK SPACE */
1508          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1509          case 0x3000:    /* IDEOGRAPHIC SPACE */
1510          RRETURN(MATCH_NOMATCH);
1511          }
1512        ecode++;
1513        break;
1514    
1515        case OP_HSPACE:
1516        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517        GETCHARINCTEST(c, eptr);
1518        switch(c)
1519          {
1520          default: RRETURN(MATCH_NOMATCH);
1521          case 0x09:      /* HT */
1522          case 0x20:      /* SPACE */
1523          case 0xa0:      /* NBSP */
1524          case 0x1680:    /* OGHAM SPACE MARK */
1525          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1526          case 0x2000:    /* EN QUAD */
1527          case 0x2001:    /* EM QUAD */
1528          case 0x2002:    /* EN SPACE */
1529          case 0x2003:    /* EM SPACE */
1530          case 0x2004:    /* THREE-PER-EM SPACE */
1531          case 0x2005:    /* FOUR-PER-EM SPACE */
1532          case 0x2006:    /* SIX-PER-EM SPACE */
1533          case 0x2007:    /* FIGURE SPACE */
1534          case 0x2008:    /* PUNCTUATION SPACE */
1535          case 0x2009:    /* THIN SPACE */
1536          case 0x200A:    /* HAIR SPACE */
1537          case 0x202f:    /* NARROW NO-BREAK SPACE */
1538          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1539          case 0x3000:    /* IDEOGRAPHIC SPACE */
1540          break;
1541          }
1542        ecode++;
1543        break;
1544    
1545        case OP_NOT_VSPACE:
1546        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1547        GETCHARINCTEST(c, eptr);
1548        switch(c)
1549          {
1550          default: break;
1551          case 0x0a:      /* LF */
1552          case 0x0b:      /* VT */
1553          case 0x0c:      /* FF */
1554          case 0x0d:      /* CR */
1555          case 0x85:      /* NEL */
1556          case 0x2028:    /* LINE SEPARATOR */
1557          case 0x2029:    /* PARAGRAPH SEPARATOR */
1558          RRETURN(MATCH_NOMATCH);
1559          }
1560        ecode++;
1561        break;
1562    
1563        case OP_VSPACE:
1564        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1565        GETCHARINCTEST(c, eptr);
1566        switch(c)
1567          {
1568          default: RRETURN(MATCH_NOMATCH);
1569          case 0x0a:      /* LF */
1570          case 0x0b:      /* VT */
1571          case 0x0c:      /* FF */
1572          case 0x0d:      /* CR */
1573          case 0x85:      /* NEL */
1574          case 0x2028:    /* LINE SEPARATOR */
1575          case 0x2029:    /* PARAGRAPH SEPARATOR */
1576          break;
1577          }
1578        ecode++;
1579        break;
1580    
1581  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1582      /* Check the next character by Unicode property. We will get here only      /* Check the next character by Unicode property. We will get here only
1583      if the support is in the binary; otherwise a compile-time error occurs. */      if the support is in the binary; otherwise a compile-time error occurs. */
# Line 2676  for (;;) Line 2786  for (;;)
2786            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2787              {              {
2788              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2789              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2790              }              }
2791            break;            break;
2792    
# Line 2684  for (;;) Line 2794  for (;;)
2794            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2795              {              {
2796              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2797              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2798              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2799              if ((prop_chartype == ucp_Lu ||              if ((prop_chartype == ucp_Lu ||
2800                   prop_chartype == ucp_Ll ||                   prop_chartype == ucp_Ll ||
# Line 2697  for (;;) Line 2807  for (;;)
2807            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2808              {              {
2809              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2810              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2811              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2812              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2813                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2708  for (;;) Line 2818  for (;;)
2818            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2819              {              {
2820              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2821              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2822              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2823              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2824                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2719  for (;;) Line 2829  for (;;)
2829            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2830              {              {
2831              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2832              GETCHARINC(c, eptr);              GETCHARINCTEST(c, eptr);
2833              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);              prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2834              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
2835                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2800  for (;;) Line 2910  for (;;)
2910            }            }
2911          break;          break;
2912    
2913            case OP_NOT_HSPACE:
2914            for (i = 1; i <= min; i++)
2915              {
2916              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2917              GETCHARINC(c, eptr);
2918              switch(c)
2919                {
2920                default: break;
2921                case 0x09:      /* HT */
2922                case 0x20:      /* SPACE */
2923                case 0xa0:      /* NBSP */
2924                case 0x1680:    /* OGHAM SPACE MARK */
2925                case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2926                case 0x2000:    /* EN QUAD */
2927                case 0x2001:    /* EM QUAD */
2928                case 0x2002:    /* EN SPACE */
2929                case 0x2003:    /* EM SPACE */
2930                case 0x2004:    /* THREE-PER-EM SPACE */
2931                case 0x2005:    /* FOUR-PER-EM SPACE */
2932                case 0x2006:    /* SIX-PER-EM SPACE */
2933                case 0x2007:    /* FIGURE SPACE */
2934                case 0x2008:    /* PUNCTUATION SPACE */
2935                case 0x2009:    /* THIN SPACE */
2936                case 0x200A:    /* HAIR SPACE */
2937                case 0x202f:    /* NARROW NO-BREAK SPACE */
2938                case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2939                case 0x3000:    /* IDEOGRAPHIC SPACE */
2940                RRETURN(MATCH_NOMATCH);
2941                }
2942              }
2943            break;
2944    
2945            case OP_HSPACE:
2946            for (i = 1; i <= min; i++)
2947              {
2948              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949              GETCHARINC(c, eptr);
2950              switch(c)
2951                {
2952                default: RRETURN(MATCH_NOMATCH);
2953                case 0x09:      /* HT */
2954                case 0x20:      /* SPACE */
2955                case 0xa0:      /* NBSP */
2956                case 0x1680:    /* OGHAM SPACE MARK */
2957                case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2958                case 0x2000:    /* EN QUAD */
2959                case 0x2001:    /* EM QUAD */
2960                case 0x2002:    /* EN SPACE */
2961                case 0x2003:    /* EM SPACE */
2962                case 0x2004:    /* THREE-PER-EM SPACE */
2963                case 0x2005:    /* FOUR-PER-EM SPACE */
2964                case 0x2006:    /* SIX-PER-EM SPACE */
2965                case 0x2007:    /* FIGURE SPACE */
2966                case 0x2008:    /* PUNCTUATION SPACE */
2967                case 0x2009:    /* THIN SPACE */
2968                case 0x200A:    /* HAIR SPACE */
2969                case 0x202f:    /* NARROW NO-BREAK SPACE */
2970                case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2971                case 0x3000:    /* IDEOGRAPHIC SPACE */
2972                break;
2973                }
2974              }
2975            break;
2976    
2977            case OP_NOT_VSPACE:
2978            for (i = 1; i <= min; i++)
2979              {
2980              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2981              GETCHARINC(c, eptr);
2982              switch(c)
2983                {
2984                default: break;
2985                case 0x0a:      /* LF */
2986                case 0x0b:      /* VT */
2987                case 0x0c:      /* FF */
2988                case 0x0d:      /* CR */
2989                case 0x85:      /* NEL */
2990                case 0x2028:    /* LINE SEPARATOR */
2991                case 0x2029:    /* PARAGRAPH SEPARATOR */
2992                RRETURN(MATCH_NOMATCH);
2993                }
2994              }
2995            break;
2996    
2997            case OP_VSPACE:
2998            for (i = 1; i <= min; i++)
2999              {
3000              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3001              GETCHARINC(c, eptr);
3002              switch(c)
3003                {
3004                default: RRETURN(MATCH_NOMATCH);
3005                case 0x0a:      /* LF */
3006                case 0x0b:      /* VT */
3007                case 0x0c:      /* FF */
3008                case 0x0d:      /* CR */
3009                case 0x85:      /* NEL */
3010                case 0x2028:    /* LINE SEPARATOR */
3011                case 0x2029:    /* PARAGRAPH SEPARATOR */
3012                break;
3013                }
3014              }
3015            break;
3016    
3017          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
3018          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
3019            {            {
# Line 2911  for (;;) Line 3125  for (;;)
3125            }            }
3126          break;          break;
3127    
3128            case OP_NOT_HSPACE:
3129            for (i = 1; i <= min; i++)
3130              {
3131              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3132              switch(*eptr++)
3133                {
3134                default: break;
3135                case 0x09:      /* HT */
3136                case 0x20:      /* SPACE */
3137                case 0xa0:      /* NBSP */
3138                RRETURN(MATCH_NOMATCH);
3139                }
3140              }
3141            break;
3142    
3143            case OP_HSPACE:
3144            for (i = 1; i <= min; i++)
3145              {
3146              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3147              switch(*eptr++)
3148                {
3149                default: RRETURN(MATCH_NOMATCH);
3150                case 0x09:      /* HT */
3151                case 0x20:      /* SPACE */
3152                case 0xa0:      /* NBSP */
3153                break;
3154                }
3155              }
3156            break;
3157    
3158            case OP_NOT_VSPACE:
3159            for (i = 1; i <= min; i++)
3160              {
3161              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3162              switch(*eptr++)
3163                {
3164                default: break;
3165                case 0x0a:      /* LF */
3166                case 0x0b:      /* VT */
3167                case 0x0c:      /* FF */
3168                case 0x0d:      /* CR */
3169                case 0x85:      /* NEL */
3170                RRETURN(MATCH_NOMATCH);
3171                }
3172              }
3173            break;
3174    
3175            case OP_VSPACE:
3176            for (i = 1; i <= min; i++)
3177              {
3178              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3179              switch(*eptr++)
3180                {
3181                default: RRETURN(MATCH_NOMATCH);
3182                case 0x0a:      /* LF */
3183                case 0x0b:      /* VT */
3184                case 0x0c:      /* FF */
3185                case 0x0d:      /* CR */
3186                case 0x85:      /* NEL */
3187                break;
3188                }
3189              }
3190            break;
3191    
3192          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
3193          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
3194            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
# Line 3102  for (;;) Line 3380  for (;;)
3380                }                }
3381              break;              break;
3382    
3383                case OP_NOT_HSPACE:
3384                switch(c)
3385                  {
3386                  default: break;
3387                  case 0x09:      /* HT */
3388                  case 0x20:      /* SPACE */
3389                  case 0xa0:      /* NBSP */
3390                  case 0x1680:    /* OGHAM SPACE MARK */
3391                  case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
3392                  case 0x2000:    /* EN QUAD */
3393                  case 0x2001:    /* EM QUAD */
3394                  case 0x2002:    /* EN SPACE */
3395                  case 0x2003:    /* EM SPACE */
3396                  case 0x2004:    /* THREE-PER-EM SPACE */
3397                  case 0x2005:    /* FOUR-PER-EM SPACE */
3398                  case 0x2006:    /* SIX-PER-EM SPACE */
3399                  case 0x2007:    /* FIGURE SPACE */
3400                  case 0x2008:    /* PUNCTUATION SPACE */
3401                  case 0x2009:    /* THIN SPACE */
3402                  case 0x200A:    /* HAIR SPACE */
3403                  case 0x202f:    /* NARROW NO-BREAK SPACE */
3404                  case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
3405                  case 0x3000:    /* IDEOGRAPHIC SPACE */
3406                  RRETURN(MATCH_NOMATCH);
3407                  }
3408                break;
3409    
3410                case OP_HSPACE:
3411                switch(c)
3412                  {
3413                  default: RRETURN(MATCH_NOMATCH);
3414                  case 0x09:      /* HT */
3415                  case 0x20:      /* SPACE */
3416                  case 0xa0:      /* NBSP */
3417                  case 0x1680:    /* OGHAM SPACE MARK */
3418                  case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
3419                  case 0x2000:    /* EN QUAD */
3420                  case 0x2001:    /* EM QUAD */
3421                  case 0x2002:    /* EN SPACE */
3422                  case 0x2003:    /* EM SPACE */
3423                  case 0x2004:    /* THREE-PER-EM SPACE */
3424                  case 0x2005:    /* FOUR-PER-EM SPACE */
3425                  case 0x2006:    /* SIX-PER-EM SPACE */
3426                  case 0x2007:    /* FIGURE SPACE */
3427                  case 0x2008:    /* PUNCTUATION SPACE */
3428                  case 0x2009:    /* THIN SPACE */
3429                  case 0x200A:    /* HAIR SPACE */
3430                  case 0x202f:    /* NARROW NO-BREAK SPACE */
3431                  case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
3432                  case 0x3000:    /* IDEOGRAPHIC SPACE */
3433                  break;
3434                  }
3435                break;
3436    
3437                case OP_NOT_VSPACE:
3438                switch(c)
3439                  {
3440                  default: break;
3441                  case 0x0a:      /* LF */
3442                  case 0x0b:      /* VT */
3443                  case 0x0c:      /* FF */
3444                  case 0x0d:      /* CR */
3445                  case 0x85:      /* NEL */
3446                  case 0x2028:    /* LINE SEPARATOR */
3447                  case 0x2029:    /* PARAGRAPH SEPARATOR */
3448                  RRETURN(MATCH_NOMATCH);
3449                  }
3450                break;
3451    
3452                case OP_VSPACE:
3453                switch(c)
3454                  {
3455                  default: RRETURN(MATCH_NOMATCH);
3456                  case 0x0a:      /* LF */
3457                  case 0x0b:      /* VT */
3458                  case 0x0c:      /* FF */
3459                  case 0x0d:      /* CR */
3460                  case 0x85:      /* NEL */
3461                  case 0x2028:    /* LINE SEPARATOR */
3462                  case 0x2029:    /* PARAGRAPH SEPARATOR */
3463                  break;
3464                  }
3465                break;
3466    
3467              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3468              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3469                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 3173  for (;;) Line 3535  for (;;)
3535                }                }
3536              break;              break;
3537    
3538                case OP_NOT_HSPACE:
3539                switch(c)
3540                  {
3541                  default: break;
3542                  case 0x09:      /* HT */
3543                  case 0x20:      /* SPACE */
3544                  case 0xa0:      /* NBSP */
3545                  RRETURN(MATCH_NOMATCH);
3546                  }
3547                break;
3548    
3549                case OP_HSPACE:
3550                switch(c)
3551                  {
3552                  default: RRETURN(MATCH_NOMATCH);
3553                  case 0x09:      /* HT */
3554                  case 0x20:      /* SPACE */
3555                  case 0xa0:      /* NBSP */
3556                  break;
3557                  }
3558                break;
3559    
3560                case OP_NOT_VSPACE:
3561                switch(c)
3562                  {
3563                  default: break;
3564                  case 0x0a:      /* LF */
3565                  case 0x0b:      /* VT */
3566                  case 0x0c:      /* FF */
3567                  case 0x0d:      /* CR */
3568                  case 0x85:      /* NEL */
3569                  RRETURN(MATCH_NOMATCH);
3570                  }
3571                break;
3572    
3573                case OP_VSPACE:
3574                switch(c)
3575                  {
3576                  default: RRETURN(MATCH_NOMATCH);
3577                  case 0x0a:      /* LF */
3578                  case 0x0b:      /* VT */
3579                  case 0x0c:      /* FF */
3580                  case 0x0d:      /* CR */
3581                  case 0x85:      /* NEL */
3582                  break;
3583                  }
3584                break;
3585    
3586              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3587              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3588              break;              break;
# Line 3354  for (;;) Line 3764  for (;;)
3764          switch(ctype)          switch(ctype)
3765            {            {
3766            case OP_ANY:            case OP_ANY:
   
           /* Special code is required for UTF8, but when the maximum is  
           unlimited we don't need it, so we repeat the non-UTF8 code. This is  
           probably worth it, because .* is quite a common idiom. */  
   
3767            if (max < INT_MAX)            if (max < INT_MAX)
3768              {              {
3769              if ((ims & PCRE_DOTALL) == 0)              if ((ims & PCRE_DOTALL) == 0)
# Line 3391  for (;;) Line 3796  for (;;)
3796                  {                  {
3797                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3798                  eptr++;                  eptr++;
3799                    while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3800                  }                  }
               break;  
3801                }                }
3802              else              else
3803                {                {
3804                c = max - min;                eptr = md->end_subject;
               if (c > (unsigned int)(md->end_subject - eptr))  
                 c = md->end_subject - eptr;  
               eptr += c;  
3805                }                }
3806              }              }
3807            break;            break;
# Line 3434  for (;;) Line 3836  for (;;)
3836              }              }
3837            break;            break;
3838    
3839              case OP_NOT_HSPACE:
3840              case OP_HSPACE:
3841              for (i = min; i < max; i++)
3842                {
3843                BOOL gotspace;
3844                int len = 1;
3845                if (eptr >= md->end_subject) break;
3846                GETCHARLEN(c, eptr, len);
3847                switch(c)
3848                  {
3849                  default: gotspace = FALSE; break;
3850                  case 0x09:      /* HT */
3851                  case 0x20:      /* SPACE */
3852                  case 0xa0:      /* NBSP */
3853                  case 0x1680:    /* OGHAM SPACE MARK */
3854                  case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
3855                  case 0x2000:    /* EN QUAD */
3856                  case 0x2001:    /* EM QUAD */
3857                  case 0x2002:    /* EN SPACE */
3858                  case 0x2003:    /* EM SPACE */
3859                  case 0x2004:    /* THREE-PER-EM SPACE */
3860                  case 0x2005:    /* FOUR-PER-EM SPACE */
3861                  case 0x2006:    /* SIX-PER-EM SPACE */
3862                  case 0x2007:    /* FIGURE SPACE */
3863                  case 0x2008:    /* PUNCTUATION SPACE */
3864                  case 0x2009:    /* THIN SPACE */
3865                  case 0x200A:    /* HAIR SPACE */
3866                  case 0x202f:    /* NARROW NO-BREAK SPACE */
3867                  case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
3868                  case 0x3000:    /* IDEOGRAPHIC SPACE */
3869                  gotspace = TRUE;
3870                  break;
3871                  }
3872                if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3873                eptr += len;
3874                }
3875              break;
3876    
3877              case OP_NOT_VSPACE:
3878              case OP_VSPACE:
3879              for (i = min; i < max; i++)
3880                {
3881                BOOL gotspace;
3882                int len = 1;
3883                if (eptr >= md->end_subject) break;
3884                GETCHARLEN(c, eptr, len);
3885                switch(c)
3886                  {
3887                  default: gotspace = FALSE; break;
3888                  case 0x0a:      /* LF */
3889                  case 0x0b:      /* VT */
3890                  case 0x0c:      /* FF */
3891                  case 0x0d:      /* CR */
3892                  case 0x85:      /* NEL */
3893                  case 0x2028:    /* LINE SEPARATOR */
3894                  case 0x2029:    /* PARAGRAPH SEPARATOR */
3895                  gotspace = TRUE;
3896                  break;
3897                  }
3898                if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3899                eptr += len;
3900                }
3901              break;
3902    
3903            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
3904            for (i = min; i < max; i++)            for (i = min; i < max; i++)
3905              {              {
# Line 3560  for (;;) Line 4026  for (;;)
4026              }              }
4027            break;            break;
4028    
4029              case OP_NOT_HSPACE:
4030              for (i = min; i < max; i++)
4031                {
4032                if (eptr >= md->end_subject) break;
4033                c = *eptr;
4034                if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4035                eptr++;
4036                }
4037              break;
4038    
4039              case OP_HSPACE:
4040              for (i = min; i < max; i++)
4041                {
4042                if (eptr >= md->end_subject) break;
4043                c = *eptr;
4044                if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4045                eptr++;
4046                }
4047              break;
4048    
4049              case OP_NOT_VSPACE:
4050              for (i = min; i < max; i++)
4051                {
4052                if (eptr >= md->end_subject) break;
4053                c = *eptr;
4054                if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4055                  break;
4056                eptr++;
4057                }
4058              break;
4059    
4060              case OP_VSPACE:
4061              for (i = min; i < max; i++)
4062                {
4063                if (eptr >= md->end_subject) break;
4064                c = *eptr;
4065                if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4066                  break;
4067                eptr++;
4068                }
4069              break;
4070    
4071            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
4072            for (i = min; i < max; i++)            for (i = min; i < max; i++)
4073              {              {
# Line 3651  for (;;) Line 4159  for (;;)
4159  /* Control never reaches here */  /* Control never reaches here */
4160    
4161    
4162  /* When compiling to use the heap rather than the stack for recursive calls to  /* When compiling to use the heap rather than the stack for recursive calls to
4163  match(), the RRETURN() macro jumps here. The number that is saved in  match(), the RRETURN() macro jumps here. The number that is saved in
4164  frame->Xwhere indicates which label we actually want to return to. */  frame->Xwhere indicates which label we actually want to return to. */
4165    
4166  #ifdef NO_RECURSE  #ifdef NO_RECURSE
# Line 3670  switch (frame->Xwhere) Line 4178  switch (frame->Xwhere)
4178    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));    DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4179    return PCRE_ERROR_INTERNAL;    return PCRE_ERROR_INTERNAL;
4180    }    }
4181  #undef LBL  #undef LBL
4182  #endif  /* NO_RECURSE */  #endif  /* NO_RECURSE */
4183  }  }
4184    
# Line 3684  Undefine all the macros that were define Line 4192  Undefine all the macros that were define
4192  #ifdef NO_RECURSE  #ifdef NO_RECURSE
4193  #undef eptr  #undef eptr
4194  #undef ecode  #undef ecode
4195    #undef mstart
4196  #undef offset_top  #undef offset_top
4197  #undef ims  #undef ims
4198  #undef eptrb  #undef eptrb
# Line 4163  for(;;) Line 4672  for(;;)
4672    
4673    /* OK, we can now run the match. */    /* OK, we can now run the match. */
4674    
4675    md->start_match = start_match;    md->start_match_ptr = start_match;      /* Insurance */
4676    md->match_call_count = 0;    md->match_call_count = 0;
4677    md->eptrn = 0;                          /* Next free eptrchain slot */    md->eptrn = 0;                          /* Next free eptrchain slot */
4678    rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);    rc = match(start_match, md->start_code, start_match, 2, md,
4679        ims, NULL, 0, 0);
4680    
4681    /* Any return other than MATCH_NOMATCH breaks the loop. */    /* Any return other than MATCH_NOMATCH breaks the loop. */
4682    
# Line 4246  if (rc == MATCH_MATCH) Line 4756  if (rc == MATCH_MATCH)
4756    
4757    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = md->offset_overflow? 0 : md->end_offset_top/2;
4758    
4759    /* If there is space, set up the whole thing as substring 0. */    /* If there is space, set up the whole thing as substring 0. The value of
4760      md->start_match_ptr might be modified if \K was encountered on the success
4761      matching path. */
4762    
4763    if (offsetcount < 2) rc = 0; else    if (offsetcount < 2) rc = 0; else
4764      {      {
4765      offsets[0] = start_match - md->start_subject;      offsets[0] = md->start_match_ptr - md->start_subject;
4766      offsets[1] = md->end_match_ptr - md->start_subject;      offsets[1] = md->end_match_ptr - md->start_subject;
4767      }      }
4768    

Legend:
Removed from v.164  
changed lines
  Added in v.190

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12