12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- *
- * Copyright (C) 1999-2014, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- *******************************************************************************
- * file name: uniset_props.cpp
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2004aug25
- * created by: Markus W. Scherer
- *
- * Character property dependent functions moved here from uniset.cpp
- */
- #include "unicode/utypes.h"
- #include "unicode/uniset.h"
- #include "unicode/parsepos.h"
- #include "unicode/uchar.h"
- #include "unicode/uscript.h"
- #include "unicode/symtable.h"
- #include "unicode/uset.h"
- #include "unicode/locid.h"
- #include "unicode/brkiter.h"
- #include "uset_imp.h"
- #include "ruleiter.h"
- #include "cmemory.h"
- #include "ucln_cmn.h"
- #include "util.h"
- #include "uvector.h"
- #include "uprops.h"
- #include "propname.h"
- #include "normalizer2impl.h"
- #include "uinvchar.h"
- #include "uprops.h"
- #include "charstr.h"
- #include "cstring.h"
- #include "mutex.h"
- #include "umutex.h"
- #include "uassert.h"
- #include "hash.h"
- U_NAMESPACE_USE
- namespace {
- // Special property set IDs
- constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
- constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F]
- constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]
- // Unicode name property alias
- constexpr char16_t NAME_PROP[] = u"na";
- } // namespace
- // Cached sets ------------------------------------------------------------- ***
- U_CDECL_BEGIN
- static UBool U_CALLCONV uset_cleanup();
- static UnicodeSet *uni32Singleton;
- static icu::UInitOnce uni32InitOnce {};
- /**
- * Cleanup function for UnicodeSet
- */
- static UBool U_CALLCONV uset_cleanup() {
- delete uni32Singleton;
- uni32Singleton = nullptr;
- uni32InitOnce.reset();
- return true;
- }
- U_CDECL_END
- U_NAMESPACE_BEGIN
- namespace {
- // Cache some sets for other services -------------------------------------- ***
- void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
- U_ASSERT(uni32Singleton == nullptr);
- uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"), errorCode);
- if(uni32Singleton==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- } else {
- uni32Singleton->freeze();
- }
- ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
- }
- U_CFUNC UnicodeSet *
- uniset_getUnicode32Instance(UErrorCode &errorCode) {
- umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
- return uni32Singleton;
- }
- // helper functions for matching of pattern syntax pieces ------------------ ***
- // these functions are parallel to the PERL_OPEN etc. strings above
- // using these functions is not only faster than UnicodeString::compare() and
- // caseCompare(), but they also make UnicodeSet work for simple patterns when
- // no Unicode properties data is available - when caseCompare() fails
- inline UBool
- isPerlOpen(const UnicodeString &pattern, int32_t pos) {
- char16_t c;
- return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
- }
- /*static inline UBool
- isPerlClose(const UnicodeString &pattern, int32_t pos) {
- return pattern.charAt(pos)==u'}';
- }*/
- inline UBool
- isNameOpen(const UnicodeString &pattern, int32_t pos) {
- return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
- }
- inline UBool
- isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
- return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
- }
- /*static inline UBool
- isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
- return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
- }*/
- // TODO memory debugging provided inside uniset.cpp
- // could be made available here but probably obsolete with use of modern
- // memory leak checker tools
- #define _dbgct(me)
- } // namespace
- //----------------------------------------------------------------
- // Constructors &c
- //----------------------------------------------------------------
- /**
- * Constructs a set from the given pattern, optionally ignoring
- * white space. See the class description for the syntax of the
- * pattern language.
- * @param pattern a string specifying what characters are in the set
- */
- UnicodeSet::UnicodeSet(const UnicodeString& pattern,
- UErrorCode& status) {
- applyPattern(pattern, status);
- _dbgct(this);
- }
- //----------------------------------------------------------------
- // Public API
- //----------------------------------------------------------------
- UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
- UErrorCode& status) {
- // Equivalent to
- // return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
- // but without dependency on closeOver().
- ParsePosition pos(0);
- applyPatternIgnoreSpace(pattern, pos, nullptr, status);
- if (U_FAILURE(status)) return *this;
- int32_t i = pos.getIndex();
- // Skip over trailing whitespace
- ICU_Utility::skipWhitespace(pattern, i, true);
- if (i != pattern.length()) {
- status = U_ILLEGAL_ARGUMENT_ERROR;
- }
- return *this;
- }
- void
- UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
- ParsePosition& pos,
- const SymbolTable* symbols,
- UErrorCode& status) {
- if (U_FAILURE(status)) {
- return;
- }
- if (isFrozen()) {
- status = U_NO_WRITE_PERMISSION;
- return;
- }
- // Need to build the pattern in a temporary string because
- // _applyPattern calls add() etc., which set pat to empty.
- UnicodeString rebuiltPat;
- RuleCharacterIterator chars(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
- if (U_FAILURE(status)) return;
- if (chars.inVariable()) {
- // syntaxError(chars, "Extra chars in variable value");
- status = U_MALFORMED_SET;
- return;
- }
- setPattern(rebuiltPat);
- }
- /**
- * Return true if the given position, in the given pattern, appears
- * to be the start of a UnicodeSet pattern.
- */
- UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
- return ((pos+1) < pattern.length() &&
- pattern.charAt(pos) == static_cast<char16_t>(91)/*[*/) ||
- resemblesPropertyPattern(pattern, pos);
- }
- //----------------------------------------------------------------
- // Implementation: Pattern parsing
- //----------------------------------------------------------------
- namespace {
- /**
- * A small all-inline class to manage a UnicodeSet pointer. Add
- * operator->() etc. as needed.
- */
- class UnicodeSetPointer {
- UnicodeSet* p;
- public:
- inline UnicodeSetPointer() : p(nullptr) {}
- inline ~UnicodeSetPointer() { delete p; }
- inline UnicodeSet* pointer() { return p; }
- inline UBool allocate() {
- if (p == nullptr) {
- p = new UnicodeSet();
- }
- return p != nullptr;
- }
- };
- constexpr int32_t MAX_DEPTH = 100;
- } // namespace
- /**
- * Parse the pattern from the given RuleCharacterIterator. The
- * iterator is advanced over the parsed pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param symbols symbol table to use to parse and dereference
- * variables, or null if none.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
- */
- void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
- const SymbolTable* symbols,
- UnicodeString& rebuiltPat,
- uint32_t options,
- UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
- int32_t depth,
- UErrorCode& ec) {
- if (U_FAILURE(ec)) return;
- if (depth > MAX_DEPTH) {
- ec = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- // Syntax characters: [ ] ^ - & { }
- // Recognized special forms for chars, sets: c-c s-s s&s
- int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
- RuleCharacterIterator::PARSE_ESCAPES;
- if ((options & USET_IGNORE_SPACE) != 0) {
- opts |= RuleCharacterIterator::SKIP_WHITESPACE;
- }
- UnicodeString patLocal, buf;
- UBool usePat = false;
- UnicodeSetPointer scratch;
- RuleCharacterIterator::Pos backup;
- // mode: 0=before [, 1=between [...], 2=after ]
- // lastItem: 0=none, 1=char, 2=set
- int8_t lastItem = 0, mode = 0;
- UChar32 lastChar = 0;
- char16_t op = 0;
- UBool invert = false;
- clear();
- while (mode != 2 && !chars.atEnd()) {
- U_ASSERT((lastItem == 0 && op == 0) ||
- (lastItem == 1 && (op == 0 || op == u'-')) ||
- (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
- UChar32 c = 0;
- UBool literal = false;
- UnicodeSet* nested = nullptr; // alias - do not delete
- // -------- Check for property pattern
- // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
- int8_t setMode = 0;
- if (resemblesPropertyPattern(chars, opts)) {
- setMode = 2;
- }
- // -------- Parse '[' of opening delimiter OR nested set.
- // If there is a nested set, use `setMode' to define how
- // the set should be parsed. If the '[' is part of the
- // opening delimiter for this pattern, parse special
- // strings "[", "[^", "[-", and "[^-". Check for stand-in
- // characters representing a nested set in the symbol
- // table.
- else {
- // Prepare to backup if necessary
- chars.getPos(backup);
- c = chars.next(opts, literal, ec);
- if (U_FAILURE(ec)) return;
- if (c == u'[' && !literal) {
- if (mode == 1) {
- chars.setPos(backup); // backup
- setMode = 1;
- } else {
- // Handle opening '[' delimiter
- mode = 1;
- patLocal.append(u'[');
- chars.getPos(backup); // prepare to backup
- c = chars.next(opts, literal, ec);
- if (U_FAILURE(ec)) return;
- if (c == u'^' && !literal) {
- invert = true;
- patLocal.append(u'^');
- chars.getPos(backup); // prepare to backup
- c = chars.next(opts, literal, ec);
- if (U_FAILURE(ec)) return;
- }
- // Fall through to handle special leading '-';
- // otherwise restart loop for nested [], \p{}, etc.
- if (c == u'-') {
- literal = true;
- // Fall through to handle literal '-' below
- } else {
- chars.setPos(backup); // backup
- continue;
- }
- }
- } else if (symbols != nullptr) {
- const UnicodeFunctor *m = symbols->lookupMatcher(c);
- if (m != nullptr) {
- const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
- if (ms == nullptr) {
- ec = U_MALFORMED_SET;
- return;
- }
- // casting away const, but `nested' won't be modified
- // (important not to modify stored set)
- nested = const_cast<UnicodeSet*>(ms);
- setMode = 3;
- }
- }
- }
- // -------- Handle a nested set. This either is inline in
- // the pattern or represented by a stand-in that has
- // previously been parsed and was looked up in the symbol
- // table.
- if (setMode != 0) {
- if (lastItem == 1) {
- if (op != 0) {
- // syntaxError(chars, "Char expected after operator");
- ec = U_MALFORMED_SET;
- return;
- }
- add(lastChar, lastChar);
- _appendToPat(patLocal, lastChar, false);
- lastItem = 0;
- op = 0;
- }
- if (op == u'-' || op == u'&') {
- patLocal.append(op);
- }
- if (nested == nullptr) {
- // lazy allocation
- if (!scratch.allocate()) {
- ec = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- nested = scratch.pointer();
- }
- switch (setMode) {
- case 1:
- nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
- break;
- case 2:
- chars.skipIgnored(opts);
- nested->applyPropertyPattern(chars, patLocal, ec);
- if (U_FAILURE(ec)) return;
- break;
- case 3: // `nested' already parsed
- nested->_toPattern(patLocal, false);
- break;
- }
- usePat = true;
- if (mode == 0) {
- // Entire pattern is a category; leave parse loop
- *this = *nested;
- mode = 2;
- break;
- }
- switch (op) {
- case u'-':
- removeAll(*nested);
- break;
- case u'&':
- retainAll(*nested);
- break;
- case 0:
- addAll(*nested);
- break;
- }
- op = 0;
- lastItem = 2;
- continue;
- }
- if (mode == 0) {
- // syntaxError(chars, "Missing '['");
- ec = U_MALFORMED_SET;
- return;
- }
- // -------- Parse special (syntax) characters. If the
- // current character is not special, or if it is escaped,
- // then fall through and handle it below.
- if (!literal) {
- switch (c) {
- case u']':
- if (lastItem == 1) {
- add(lastChar, lastChar);
- _appendToPat(patLocal, lastChar, false);
- }
- // Treat final trailing '-' as a literal
- if (op == u'-') {
- add(op, op);
- patLocal.append(op);
- } else if (op == u'&') {
- // syntaxError(chars, "Trailing '&'");
- ec = U_MALFORMED_SET;
- return;
- }
- patLocal.append(u']');
- mode = 2;
- continue;
- case u'-':
- if (op == 0) {
- if (lastItem != 0) {
- op = static_cast<char16_t>(c);
- continue;
- } else {
- // Treat final trailing '-' as a literal
- add(c, c);
- c = chars.next(opts, literal, ec);
- if (U_FAILURE(ec)) return;
- if (c == u']' && !literal) {
- patLocal.append(u"-]", 2);
- mode = 2;
- continue;
- }
- }
- }
- // syntaxError(chars, "'-' not after char or set");
- ec = U_MALFORMED_SET;
- return;
- case u'&':
- if (lastItem == 2 && op == 0) {
- op = static_cast<char16_t>(c);
- continue;
- }
- // syntaxError(chars, "'&' not after set");
- ec = U_MALFORMED_SET;
- return;
- case u'^':
- // syntaxError(chars, "'^' not after '['");
- ec = U_MALFORMED_SET;
- return;
- case u'{':
- if (op != 0) {
- // syntaxError(chars, "Missing operand after operator");
- ec = U_MALFORMED_SET;
- return;
- }
- if (lastItem == 1) {
- add(lastChar, lastChar);
- _appendToPat(patLocal, lastChar, false);
- }
- lastItem = 0;
- buf.truncate(0);
- {
- UBool ok = false;
- while (!chars.atEnd()) {
- c = chars.next(opts, literal, ec);
- if (U_FAILURE(ec)) return;
- if (c == u'}' && !literal) {
- ok = true;
- break;
- }
- buf.append(c);
- }
- if (!ok) {
- // syntaxError(chars, "Invalid multicharacter string");
- ec = U_MALFORMED_SET;
- return;
- }
- }
- // We have new string. Add it to set and continue;
- // we don't need to drop through to the further
- // processing
- add(buf);
- patLocal.append(u'{');
- _appendToPat(patLocal, buf, false);
- patLocal.append(u'}');
- continue;
- case SymbolTable::SYMBOL_REF:
- // symbols nosymbols
- // [a-$] error error (ambiguous)
- // [a$] anchor anchor
- // [a-$x] var "x"* literal '$'
- // [a-$.] error literal '$'
- // *We won't get here in the case of var "x"
- {
- chars.getPos(backup);
- c = chars.next(opts, literal, ec);
- if (U_FAILURE(ec)) return;
- UBool anchor = (c == u']' && !literal);
- if (symbols == nullptr && !anchor) {
- c = SymbolTable::SYMBOL_REF;
- chars.setPos(backup);
- break; // literal '$'
- }
- if (anchor && op == 0) {
- if (lastItem == 1) {
- add(lastChar, lastChar);
- _appendToPat(patLocal, lastChar, false);
- }
- add(U_ETHER);
- usePat = true;
- patLocal.append(static_cast<char16_t>(SymbolTable::SYMBOL_REF));
- patLocal.append(u']');
- mode = 2;
- continue;
- }
- // syntaxError(chars, "Unquoted '$'");
- ec = U_MALFORMED_SET;
- return;
- }
- default:
- break;
- }
- }
- // -------- Parse literal characters. This includes both
- // escaped chars ("\u4E01") and non-syntax characters
- // ("a").
- switch (lastItem) {
- case 0:
- lastItem = 1;
- lastChar = c;
- break;
- case 1:
- if (op == u'-') {
- if (lastChar >= c) {
- // Don't allow redundant (a-a) or empty (b-a) ranges;
- // these are most likely typos.
- // syntaxError(chars, "Invalid range");
- ec = U_MALFORMED_SET;
- return;
- }
- add(lastChar, c);
- _appendToPat(patLocal, lastChar, false);
- patLocal.append(op);
- _appendToPat(patLocal, c, false);
- lastItem = 0;
- op = 0;
- } else {
- add(lastChar, lastChar);
- _appendToPat(patLocal, lastChar, false);
- lastChar = c;
- }
- break;
- case 2:
- if (op != 0) {
- // syntaxError(chars, "Set expected after operator");
- ec = U_MALFORMED_SET;
- return;
- }
- lastChar = c;
- lastItem = 1;
- break;
- }
- }
- if (mode != 2) {
- // syntaxError(chars, "Missing ']'");
- ec = U_MALFORMED_SET;
- return;
- }
- chars.skipIgnored(opts);
- /**
- * Handle global flags (invert, case insensitivity). If this
- * pattern should be compiled case-insensitive, then we need
- * to close over case BEFORE COMPLEMENTING. This makes
- * patterns like /[^abc]/i work.
- */
- if ((options & USET_CASE_MASK) != 0) {
- (this->*caseClosure)(options);
- }
- if (invert) {
- complement().removeAllStrings(); // code point complement
- }
- // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
- // generated pattern.
- if (usePat) {
- rebuiltPat.append(patLocal);
- } else {
- _generatePattern(rebuiltPat, false);
- }
- if (isBogus() && U_SUCCESS(ec)) {
- // We likely ran out of memory. AHHH!
- ec = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- //----------------------------------------------------------------
- // Property set implementation
- //----------------------------------------------------------------
- namespace {
- UBool numericValueFilter(UChar32 ch, void* context) {
- return u_getNumericValue(ch) == *static_cast<double*>(context);
- }
- UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
- int32_t value = *static_cast<int32_t*>(context);
- return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
- }
- UBool versionFilter(UChar32 ch, void* context) {
- static const UVersionInfo none = { 0, 0, 0, 0 };
- UVersionInfo v;
- u_charAge(ch, v);
- UVersionInfo* version = static_cast<UVersionInfo*>(context);
- return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
- }
- typedef struct {
- UProperty prop;
- int32_t value;
- } IntPropertyContext;
- UBool intPropertyFilter(UChar32 ch, void* context) {
- IntPropertyContext* c = static_cast<IntPropertyContext*>(context);
- return u_getIntPropertyValue(ch, c->prop) == c->value;
- }
- UBool scriptExtensionsFilter(UChar32 ch, void* context) {
- return uscript_hasScript(ch, *static_cast<UScriptCode*>(context));
- }
- UBool idTypeFilter(UChar32 ch, void* context) {
- return u_hasIDType(ch, *static_cast<UIdentifierType*>(context));
- }
- } // namespace
- /**
- * Generic filter-based scanning code for UCD property UnicodeSets.
- */
- void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
- void* context,
- const UnicodeSet* inclusions,
- UErrorCode &status) {
- if (U_FAILURE(status)) return;
- // Logically, walk through all Unicode characters, noting the start
- // and end of each range for which filter.contain(c) is
- // true. Add each range to a set.
- //
- // To improve performance, use an inclusions set which
- // encodes information about character ranges that are known
- // to have identical properties.
- // inclusions contains the first characters of
- // same-value ranges for the given property.
- clear();
- UChar32 startHasProperty = -1;
- int32_t limitRange = inclusions->getRangeCount();
- for (int j=0; j<limitRange; ++j) {
- // get current range
- UChar32 start = inclusions->getRangeStart(j);
- UChar32 end = inclusions->getRangeEnd(j);
- // for all the code points in the range, process
- for (UChar32 ch = start; ch <= end; ++ch) {
- // only add to this UnicodeSet on inflection points --
- // where the hasProperty value changes to false
- if ((*filter)(ch, context)) {
- if (startHasProperty < 0) {
- startHasProperty = ch;
- }
- } else if (startHasProperty >= 0) {
- add(startHasProperty, ch-1);
- startHasProperty = -1;
- }
- }
- }
- if (startHasProperty >= 0) {
- add(startHasProperty, static_cast<UChar32>(0x10FFFF));
- }
- if (isBogus() && U_SUCCESS(status)) {
- // We likely ran out of memory. AHHH!
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- namespace {
- UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
- /* Note: we use ' ' in compiler code page */
- int32_t j = 0;
- char ch;
- --dstCapacity; /* make room for term. zero */
- while ((ch = *src++) != 0) {
- if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
- continue;
- }
- if (j >= dstCapacity) return false;
- dst[j++] = ch;
- }
- if (j > 0 && dst[j-1] == ' ') --j;
- dst[j] = 0;
- return true;
- }
- } // namespace
- //----------------------------------------------------------------
- // Property set API
- //----------------------------------------------------------------
- #define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
- ec=U_ILLEGAL_ARGUMENT_ERROR; \
- return *this; \
- } UPRV_BLOCK_MACRO_END
- UnicodeSet&
- UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
- if (U_FAILURE(ec) || isFrozen()) { return *this; }
- if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
- const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
- applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
- } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
- const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
- UScriptCode script = static_cast<UScriptCode>(value);
- applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
- } else if (prop == UCHAR_IDENTIFIER_TYPE) {
- const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
- UIdentifierType idType = static_cast<UIdentifierType>(value);
- applyFilter(idTypeFilter, &idType, inclusions, ec);
- } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
- if (value == 0 || value == 1) {
- const USet *set = u_getBinaryPropertySet(prop, &ec);
- if (U_FAILURE(ec)) { return *this; }
- copyFrom(*UnicodeSet::fromUSet(set), true);
- if (value == 0) {
- complement().removeAllStrings(); // code point complement
- }
- } else {
- clear();
- }
- } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
- const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
- IntPropertyContext c = {prop, value};
- applyFilter(intPropertyFilter, &c, inclusions, ec);
- } else {
- ec = U_ILLEGAL_ARGUMENT_ERROR;
- }
- return *this;
- }
- UnicodeSet&
- UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
- const UnicodeString& value,
- UErrorCode& ec) {
- if (U_FAILURE(ec) || isFrozen()) return *this;
- // prop and value used to be converted to char * using the default
- // converter instead of the invariant conversion.
- // This should not be necessary because all Unicode property and value
- // names use only invariant characters.
- // If there are any variant characters, then we won't find them anyway.
- // Checking first avoids assertion failures in the conversion.
- if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
- !uprv_isInvariantUString(value.getBuffer(), value.length())
- ) {
- FAIL(ec);
- }
- CharString pname, vname;
- pname.appendInvariantChars(prop, ec);
- vname.appendInvariantChars(value, ec);
- if (U_FAILURE(ec)) return *this;
- UProperty p;
- int32_t v;
- UBool invert = false;
- if (value.length() > 0) {
- p = u_getPropertyEnum(pname.data());
- if (p == UCHAR_INVALID_CODE) FAIL(ec);
- // Treat gc as gcm
- if (p == UCHAR_GENERAL_CATEGORY) {
- p = UCHAR_GENERAL_CATEGORY_MASK;
- }
- if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
- (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
- (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
- v = u_getPropertyValueEnum(p, vname.data());
- if (v == UCHAR_INVALID_CODE) {
- // Handle numeric CCC
- if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
- p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
- p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
- char* end;
- double val = uprv_strtod(vname.data(), &end);
- // Anything between 0 and 255 is valid even if unused.
- // Cast double->int only after range check.
- // We catch NaN here because comparing it with both 0 and 255 will be false
- // (as are all comparisons with NaN).
- if (*end != 0 || !(0 <= val && val <= 255) ||
- (v = static_cast<int32_t>(val)) != val) {
- // non-integral value or outside 0..255, or trailing junk
- FAIL(ec);
- }
- } else {
- FAIL(ec);
- }
- }
- }
- else {
- switch (p) {
- case UCHAR_NUMERIC_VALUE:
- {
- char* end;
- double val = uprv_strtod(vname.data(), &end);
- if (*end != 0) {
- FAIL(ec);
- }
- applyFilter(numericValueFilter, &val,
- CharacterProperties::getInclusionsForProperty(p, ec), ec);
- return *this;
- }
- case UCHAR_NAME:
- {
- // Must munge name, since u_charFromName() does not do
- // 'loose' matching.
- char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
- if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
- UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
- if (U_SUCCESS(ec)) {
- clear();
- add(ch);
- return *this;
- } else {
- FAIL(ec);
- }
- }
- case UCHAR_UNICODE_1_NAME:
- // ICU 49 deprecates the Unicode_1_Name property APIs.
- FAIL(ec);
- case UCHAR_AGE:
- {
- // Must munge name, since u_versionFromString() does not do
- // 'loose' matching.
- char buf[128];
- if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
- UVersionInfo version;
- u_versionFromString(version, buf);
- applyFilter(versionFilter, &version,
- CharacterProperties::getInclusionsForProperty(p, ec), ec);
- return *this;
- }
- case UCHAR_SCRIPT_EXTENSIONS:
- v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
- if (v == UCHAR_INVALID_CODE) {
- FAIL(ec);
- }
- // fall through to calling applyIntPropertyValue()
- break;
- case UCHAR_IDENTIFIER_TYPE:
- v = u_getPropertyValueEnum(p, vname.data());
- if (v == UCHAR_INVALID_CODE) {
- FAIL(ec);
- }
- // fall through to calling applyIntPropertyValue()
- break;
- default:
- // p is a non-binary, non-enumerated property that we
- // don't support (yet).
- FAIL(ec);
- }
- }
- }
- else {
- // value is empty. Interpret as General Category, Script, or
- // Binary property.
- p = UCHAR_GENERAL_CATEGORY_MASK;
- v = u_getPropertyValueEnum(p, pname.data());
- if (v == UCHAR_INVALID_CODE) {
- p = UCHAR_SCRIPT;
- v = u_getPropertyValueEnum(p, pname.data());
- if (v == UCHAR_INVALID_CODE) {
- p = u_getPropertyEnum(pname.data());
- if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
- v = 1;
- } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
- set(MIN_VALUE, MAX_VALUE);
- return *this;
- } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
- set(0, 0x7F);
- return *this;
- } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
- // [:Assigned:]=[:^Cn:]
- p = UCHAR_GENERAL_CATEGORY_MASK;
- v = U_GC_CN_MASK;
- invert = true;
- } else {
- FAIL(ec);
- }
- }
- }
- }
- applyIntPropertyValue(p, v, ec);
- if(invert) {
- complement().removeAllStrings(); // code point complement
- }
- if (isBogus() && U_SUCCESS(ec)) {
- // We likely ran out of memory. AHHH!
- ec = U_MEMORY_ALLOCATION_ERROR;
- }
- return *this;
- }
- //----------------------------------------------------------------
- // Property set patterns
- //----------------------------------------------------------------
- /**
- * Return true if the given position, in the given pattern, appears
- * to be the start of a property set pattern.
- */
- UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
- int32_t pos) {
- // Patterns are at least 5 characters long
- if ((pos+5) > pattern.length()) {
- return false;
- }
- // Look for an opening [:, [:^, \p, or \P
- return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
- }
- /**
- * Return true if the given iterator appears to point at a
- * property pattern. Regardless of the result, return with the
- * iterator unchanged.
- * @param chars iterator over the pattern characters. Upon return
- * it will be unchanged.
- * @param iterOpts RuleCharacterIterator options
- */
- UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
- int32_t iterOpts) {
- // NOTE: literal will always be false, because we don't parse escapes.
- UBool result = false, literal;
- UErrorCode ec = U_ZERO_ERROR;
- iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
- RuleCharacterIterator::Pos pos;
- chars.getPos(pos);
- UChar32 c = chars.next(iterOpts, literal, ec);
- if (c == u'[' || c == u'\\') {
- UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
- literal, ec);
- result = (c == u'[') ? (d == u':') :
- (d == u'N' || d == u'p' || d == u'P');
- }
- chars.setPos(pos);
- return result && U_SUCCESS(ec);
- }
- /**
- * Parse the given property pattern at the given parse position.
- */
- UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
- ParsePosition& ppos,
- UErrorCode &ec) {
- int32_t pos = ppos.getIndex();
- UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
- UBool isName = false; // true for \N{pat}, o/w false
- UBool invert = false;
- if (U_FAILURE(ec)) return *this;
- // Minimum length is 5 characters, e.g. \p{L}
- if ((pos+5) > pattern.length()) {
- FAIL(ec);
- }
- // On entry, ppos should point to one of the following locations:
- // Look for an opening [:, [:^, \p, or \P
- if (isPOSIXOpen(pattern, pos)) {
- posix = true;
- pos += 2;
- pos = ICU_Utility::skipWhitespace(pattern, pos);
- if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
- ++pos;
- invert = true;
- }
- } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
- char16_t c = pattern.charAt(pos+1);
- invert = (c == u'P');
- isName = (c == u'N');
- pos += 2;
- pos = ICU_Utility::skipWhitespace(pattern, pos);
- if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
- // Syntax error; "\p" or "\P" not followed by "{"
- FAIL(ec);
- }
- } else {
- // Open delimiter not seen
- FAIL(ec);
- }
- // Look for the matching close delimiter, either :] or }
- int32_t close;
- if (posix) {
- close = pattern.indexOf(u":]", 2, pos);
- } else {
- close = pattern.indexOf(u'}', pos);
- }
- if (close < 0) {
- // Syntax error; close delimiter missing
- FAIL(ec);
- }
- // Look for an '=' sign. If this is present, we will parse a
- // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
- // pattern.
- int32_t equals = pattern.indexOf(u'=', pos);
- UnicodeString propName, valueName;
- if (equals >= 0 && equals < close && !isName) {
- // Equals seen; parse medium/long pattern
- pattern.extractBetween(pos, equals, propName);
- pattern.extractBetween(equals+1, close, valueName);
- }
- else {
- // Handle case where no '=' is seen, and \N{}
- pattern.extractBetween(pos, close, propName);
-
- // Handle \N{name}
- if (isName) {
- // This is a little inefficient since it means we have to
- // parse NAME_PROP back to UCHAR_NAME even though we already
- // know it's UCHAR_NAME. If we refactor the API to
- // support args of (UProperty, char*) then we can remove
- // NAME_PROP and make this a little more efficient.
- valueName = propName;
- propName = NAME_PROP;
- }
- }
- applyPropertyAlias(propName, valueName, ec);
- if (U_SUCCESS(ec)) {
- if (invert) {
- complement().removeAllStrings(); // code point complement
- }
- // Move to the limit position after the close delimiter if the
- // parse succeeded.
- ppos.setIndex(close + (posix ? 2 : 1));
- }
- return *this;
- }
- /**
- * Parse a property pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- */
- void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
- UnicodeString& rebuiltPat,
- UErrorCode& ec) {
- if (U_FAILURE(ec)) return;
- UnicodeString pattern;
- chars.lookahead(pattern);
- ParsePosition pos(0);
- applyPropertyPattern(pattern, pos, ec);
- if (U_FAILURE(ec)) return;
- if (pos.getIndex() == 0) {
- // syntaxError(chars, "Invalid property pattern");
- ec = U_MALFORMED_SET;
- return;
- }
- chars.jumpahead(pos.getIndex());
- rebuiltPat.append(pattern, 0, pos.getIndex());
- }
- U_NAMESPACE_END
|