123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268 |
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /* PCRE is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language.
- Written by Philip Hazel
- Copyright (c) 1997-2013 University of Cambridge
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- /* This module contains an internal function that is used to match an extended
- class. It is used by both pcre_exec() and pcre_dfa_exec(). */
- #ifdef HAVE_CONFIG_H
- #include "pcre_config.h"
- #endif
- #include "pcre_internal.h"
- /*************************************************
- * Match character against an XCLASS *
- *************************************************/
- /* This function is called to match a character against an extended class that
- might contain values > 255 and/or Unicode properties.
- Arguments:
- c the character
- data points to the flag byte of the XCLASS data
- utf TRUE if the values of XCL_SINGLE and XCL_RANGE items are to be fetched
- with GETCHARINC rather than as one pcre_uchar each; it is forced to TRUE
- when COMPILE_PCRE8 is defined and is consulted only when SUPPORT_UTF is
- defined
- Returns: TRUE if character matches, else FALSE
- */
- BOOL
- PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
- {
- pcre_uchar t; /* type code of the list item currently being examined */
- BOOL negated = (*data & XCL_NOT) != 0; /* value returned when nothing in the list matches */
- (void)utf; /* keeps utf referenced in builds where the SUPPORT_UTF code below is compiled out */
- #ifdef COMPILE_PCRE8
- /* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
- utf = TRUE;
- #endif
- /* Character values < 256 are matched against a bitmap, if one is present.
- When the class has no properties (XCL_HASPROP clear) the answer is decided
- here: with a bitmap, the bit for c is returned as it stands (XCL_NOT is not
- applied to it at this point); without one, c is treated as not in the list.
- When XCL_HASPROP is set, a set bit in the bitmap is a match; otherwise we carry
- on and check the additional data. */
- if (c < 256)
- {
- if ((*data & XCL_HASPROP) == 0)
- {
- if ((*data & XCL_MAP) == 0) return negated;
- return (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0; /* bit c of the map that starts at data + 1 */
- }
- if ((*data & XCL_MAP) != 0 &&
- (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
- return !negated; /* char found */
- }
- /* First skip the bit map if present. Then match against the list of Unicode
- properties or large chars or ranges that end with a large char. We won't ever
- encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
- if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar); /* the map is 32 bytes (256 bits), stepped over in pcre_uchar units */
- while ((t = *data++) != XCL_END) /* one list item per iteration */
- {
- pcre_uint32 x, y; /* x: single value or start of range; y: end of range */
- if (t == XCL_SINGLE)
- {
- #ifdef SUPPORT_UTF
- if (utf)
- {
- GETCHARINC(x, data); /* macro generates multiple statements */
- }
- else
- #endif
- x = *data++; /* item value held in a single pcre_uchar */
- if (c == x) return !negated;
- }
- else if (t == XCL_RANGE)
- {
- #ifdef SUPPORT_UTF
- if (utf)
- {
- GETCHARINC(x, data); /* macro generates multiple statements */
- GETCHARINC(y, data); /* macro generates multiple statements */
- }
- else
- #endif
- {
- x = *data++;
- y = *data++;
- }
- if (c >= x && c <= y) return !negated; /* both ends of the range are inclusive */
- }
- #ifdef SUPPORT_UCP
- else /* XCL_PROP & XCL_NOTPROP */
- {
- const ucd_record *prop = GET_UCD(c); /* record obtained for c; its chartype and script fields are tested below */
- BOOL isprop = t == XCL_PROP; /* TRUE for XCL_PROP, FALSE otherwise: each test below must equal this to count as a match */
- switch(*data) /* data[0] is the property type; data[1] is its value for the types that use one */
- {
- case PT_ANY:
- if (isprop) return !negated;
- break;
- case PT_LAMP:
- if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
- prop->chartype == ucp_Lt) == isprop) return !negated;
- break;
- case PT_GC:
- if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
- return !negated;
- break;
- case PT_PC:
- if ((data[1] == prop->chartype) == isprop) return !negated;
- break;
- case PT_SC:
- if ((data[1] == prop->script) == isprop) return !negated;
- break;
- case PT_ALNUM:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
- return !negated;
- break;
- /* Perl space used to exclude VT, but from Perl 5.18 it is included,
- which means that Perl space and POSIX space are now identical. PCRE
- was changed at release 8.34. */
- case PT_SPACE: /* Perl space */
- case PT_PXSPACE: /* POSIX space */
- switch(c)
- {
- HSPACE_CASES:
- VSPACE_CASES:
- if (isprop) return !negated;
- break;
- default:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
- return !negated;
- break;
- }
- break;
- case PT_WORD:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
- PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
- == isprop)
- return !negated;
- break;
- case PT_UCNC: /* below 0xa0 only dollar, commercial at and grave accent qualify; from 0xa0 up, everything outside 0xd800-0xdfff does */
- if (c < 0xa0)
- {
- if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
- c == CHAR_GRAVE_ACCENT) == isprop)
- return !negated;
- }
- else
- {
- if ((c < 0xd800 || c > 0xdfff) == isprop)
- return !negated;
- }
- break;
- /* The following three properties can occur only in an XCLASS, as there
- is no \p or \P coding for them. */
- /* Graphic character. Implement this as not Z (space or separator) and
- not C (other), except for Cf (format) with a few exceptions. This seems
- to be what Perl does. The exceptional characters are:
- U+061C Arabic Letter Mark
- U+180E Mongolian Vowel Separator
- U+2066 - U+2069 Various "isolate"s
- */
- case PT_PXGRAPH:
- if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
- (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
- (prop->chartype == ucp_Cf &&
- c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
- )) == isprop)
- return !negated;
- break;
- /* Printable character: same as graphic, with the addition of Zs, i.e.
- not Zl and not Zp, and U+180E. */
- case PT_PXPRINT:
- if ((prop->chartype != ucp_Zl &&
- prop->chartype != ucp_Zp &&
- (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
- (prop->chartype == ucp_Cf &&
- c != 0x061c && (c < 0x2066 || c > 0x2069))
- )) == isprop)
- return !negated;
- break;
- /* Punctuation: all Unicode punctuation, plus ASCII characters that
- Unicode treats as symbols rather than punctuation, for Perl
- compatibility (these are $+<=>^`|~). */
- case PT_PXPUNCT:
- if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
- (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
- return !negated;
- break;
- /* This should never occur, but compilers may mutter if there is no
- default. */
- default:
- return FALSE; /* note: FALSE whatever the value of negated */
- }
- data += 2; /* no match on this item: step over its two units (type, value) */
- }
- #endif /* SUPPORT_UCP */
- }
- return negated; /* char did not match */
- }
- /* End of pcre_xclass.c */
|