uniset_props.cpp 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1999-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uniset_props.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2004aug25
  16. * created by: Markus W. Scherer
  17. *
  18. * Character property dependent functions moved here from uniset.cpp
  19. */
  20. #include "unicode/utypes.h"
  21. #include "unicode/uniset.h"
  22. #include "unicode/parsepos.h"
  23. #include "unicode/uchar.h"
  24. #include "unicode/uscript.h"
  25. #include "unicode/symtable.h"
  26. #include "unicode/uset.h"
  27. #include "unicode/locid.h"
  28. #include "unicode/brkiter.h"
  29. #include "uset_imp.h"
  30. #include "ruleiter.h"
  31. #include "cmemory.h"
  32. #include "ucln_cmn.h"
  33. #include "util.h"
  34. #include "uvector.h"
  35. #include "uprops.h"
  36. #include "propname.h"
  37. #include "normalizer2impl.h"
  38. #include "uinvchar.h"
  39. #include "uprops.h"
  40. #include "charstr.h"
  41. #include "cstring.h"
  42. #include "mutex.h"
  43. #include "umutex.h"
  44. #include "uassert.h"
  45. #include "hash.h"
  46. U_NAMESPACE_USE
  namespace {

  // Special property set IDs. applyPropertyAlias() compares a bare
  // property name against these with uprv_comparePropertyNames().
  constexpr char ANY[]      = "ANY";       // [\u0000-\U0010FFFF]
  constexpr char ASCII[]    = "ASCII";     // [\u0000-\u007F]
  constexpr char ASSIGNED[] = "Assigned";  // [:^Cn:]

  // Alias of the Unicode name property.
  constexpr char16_t NAME_PROP[] = u"na";

  }  // namespace
  55. // Cached sets ------------------------------------------------------------- ***
  56. U_CDECL_BEGIN
  57. static UBool U_CALLCONV uset_cleanup();
  58. static UnicodeSet *uni32Singleton;
  59. static icu::UInitOnce uni32InitOnce {};
  60. /**
  61. * Cleanup function for UnicodeSet
  62. */
  63. static UBool U_CALLCONV uset_cleanup() {
  64. delete uni32Singleton;
  65. uni32Singleton = nullptr;
  66. uni32InitOnce.reset();
  67. return true;
  68. }
  69. U_CDECL_END
  70. U_NAMESPACE_BEGIN
  71. namespace {
  72. // Cache some sets for other services -------------------------------------- ***
  73. void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
  74. U_ASSERT(uni32Singleton == nullptr);
  75. uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"), errorCode);
  76. if(uni32Singleton==nullptr) {
  77. errorCode=U_MEMORY_ALLOCATION_ERROR;
  78. } else {
  79. uni32Singleton->freeze();
  80. }
  81. ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
  82. }
  83. U_CFUNC UnicodeSet *
  84. uniset_getUnicode32Instance(UErrorCode &errorCode) {
  85. umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
  86. return uni32Singleton;
  87. }
  88. // helper functions for matching of pattern syntax pieces ------------------ ***
  89. // these functions are parallel to the PERL_OPEN etc. strings above
  90. // using these functions is not only faster than UnicodeString::compare() and
  91. // caseCompare(), but they also make UnicodeSet work for simple patterns when
  92. // no Unicode properties data is available - when caseCompare() fails
  93. inline UBool
  94. isPerlOpen(const UnicodeString &pattern, int32_t pos) {
  95. char16_t c;
  96. return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
  97. }
  98. /*static inline UBool
  99. isPerlClose(const UnicodeString &pattern, int32_t pos) {
  100. return pattern.charAt(pos)==u'}';
  101. }*/
  102. inline UBool
  103. isNameOpen(const UnicodeString &pattern, int32_t pos) {
  104. return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
  105. }
  106. inline UBool
  107. isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
  108. return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
  109. }
  110. /*static inline UBool
  111. isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
  112. return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
  113. }*/
  // TODO: uniset.cpp provides memory debugging; it could be made available
  // here as well, but is probably obsolete now that modern memory leak
  // checker tools exist. Until then this hook expands to nothing.
  #define _dbgct(me)
  118. } // namespace
  119. //----------------------------------------------------------------
  120. // Constructors &c
  121. //----------------------------------------------------------------
  122. /**
  123. * Constructs a set from the given pattern, optionally ignoring
  124. * white space. See the class description for the syntax of the
  125. * pattern language.
  126. * @param pattern a string specifying what characters are in the set
  127. */
  128. UnicodeSet::UnicodeSet(const UnicodeString& pattern,
  129. UErrorCode& status) {
  130. applyPattern(pattern, status);
  131. _dbgct(this);
  132. }
  133. //----------------------------------------------------------------
  134. // Public API
  135. //----------------------------------------------------------------
  136. UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
  137. UErrorCode& status) {
  138. // Equivalent to
  139. // return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
  140. // but without dependency on closeOver().
  141. ParsePosition pos(0);
  142. applyPatternIgnoreSpace(pattern, pos, nullptr, status);
  143. if (U_FAILURE(status)) return *this;
  144. int32_t i = pos.getIndex();
  145. // Skip over trailing whitespace
  146. ICU_Utility::skipWhitespace(pattern, i, true);
  147. if (i != pattern.length()) {
  148. status = U_ILLEGAL_ARGUMENT_ERROR;
  149. }
  150. return *this;
  151. }
  152. void
  153. UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
  154. ParsePosition& pos,
  155. const SymbolTable* symbols,
  156. UErrorCode& status) {
  157. if (U_FAILURE(status)) {
  158. return;
  159. }
  160. if (isFrozen()) {
  161. status = U_NO_WRITE_PERMISSION;
  162. return;
  163. }
  164. // Need to build the pattern in a temporary string because
  165. // _applyPattern calls add() etc., which set pat to empty.
  166. UnicodeString rebuiltPat;
  167. RuleCharacterIterator chars(pattern, symbols, pos);
  168. applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
  169. if (U_FAILURE(status)) return;
  170. if (chars.inVariable()) {
  171. // syntaxError(chars, "Extra chars in variable value");
  172. status = U_MALFORMED_SET;
  173. return;
  174. }
  175. setPattern(rebuiltPat);
  176. }
  177. /**
  178. * Return true if the given position, in the given pattern, appears
  179. * to be the start of a UnicodeSet pattern.
  180. */
  181. UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
  182. return ((pos+1) < pattern.length() &&
  183. pattern.charAt(pos) == static_cast<char16_t>(91)/*[*/) ||
  184. resemblesPropertyPattern(pattern, pos);
  185. }
  186. //----------------------------------------------------------------
  187. // Implementation: Pattern parsing
  188. //----------------------------------------------------------------
  189. namespace {
  190. /**
  191. * A small all-inline class to manage a UnicodeSet pointer. Add
  192. * operator->() etc. as needed.
  193. */
  194. class UnicodeSetPointer {
  195. UnicodeSet* p;
  196. public:
  197. inline UnicodeSetPointer() : p(nullptr) {}
  198. inline ~UnicodeSetPointer() { delete p; }
  199. inline UnicodeSet* pointer() { return p; }
  200. inline UBool allocate() {
  201. if (p == nullptr) {
  202. p = new UnicodeSet();
  203. }
  204. return p != nullptr;
  205. }
  206. };
  207. constexpr int32_t MAX_DEPTH = 100;
  208. } // namespace
  209. /**
  210. * Parse the pattern from the given RuleCharacterIterator. The
  211. * iterator is advanced over the parsed pattern.
  212. * @param chars iterator over the pattern characters. Upon return
  213. * it will be advanced to the first character after the parsed
  214. * pattern, or the end of the iteration if all characters are
  215. * parsed.
  216. * @param symbols symbol table to use to parse and dereference
  217. * variables, or null if none.
  218. * @param rebuiltPat the pattern that was parsed, rebuilt or
  219. * copied from the input pattern, as appropriate.
  220. * @param options a bit mask of zero or more of the following:
  221. * IGNORE_SPACE, CASE.
  222. */
  223. void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
  224. const SymbolTable* symbols,
  225. UnicodeString& rebuiltPat,
  226. uint32_t options,
  227. UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
  228. int32_t depth,
  229. UErrorCode& ec) {
  230. if (U_FAILURE(ec)) return;
  231. if (depth > MAX_DEPTH) {
  232. ec = U_ILLEGAL_ARGUMENT_ERROR;
  233. return;
  234. }
  235. // Syntax characters: [ ] ^ - & { }
  236. // Recognized special forms for chars, sets: c-c s-s s&s
  237. int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
  238. RuleCharacterIterator::PARSE_ESCAPES;
  239. if ((options & USET_IGNORE_SPACE) != 0) {
  240. opts |= RuleCharacterIterator::SKIP_WHITESPACE;
  241. }
  242. UnicodeString patLocal, buf;
  243. UBool usePat = false;
  244. UnicodeSetPointer scratch;
  245. RuleCharacterIterator::Pos backup;
  246. // mode: 0=before [, 1=between [...], 2=after ]
  247. // lastItem: 0=none, 1=char, 2=set
  248. int8_t lastItem = 0, mode = 0;
  249. UChar32 lastChar = 0;
  250. char16_t op = 0;
  251. UBool invert = false;
  252. clear();
  253. while (mode != 2 && !chars.atEnd()) {
  254. U_ASSERT((lastItem == 0 && op == 0) ||
  255. (lastItem == 1 && (op == 0 || op == u'-')) ||
  256. (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
  257. UChar32 c = 0;
  258. UBool literal = false;
  259. UnicodeSet* nested = nullptr; // alias - do not delete
  260. // -------- Check for property pattern
  261. // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
  262. int8_t setMode = 0;
  263. if (resemblesPropertyPattern(chars, opts)) {
  264. setMode = 2;
  265. }
  266. // -------- Parse '[' of opening delimiter OR nested set.
  267. // If there is a nested set, use `setMode' to define how
  268. // the set should be parsed. If the '[' is part of the
  269. // opening delimiter for this pattern, parse special
  270. // strings "[", "[^", "[-", and "[^-". Check for stand-in
  271. // characters representing a nested set in the symbol
  272. // table.
  273. else {
  274. // Prepare to backup if necessary
  275. chars.getPos(backup);
  276. c = chars.next(opts, literal, ec);
  277. if (U_FAILURE(ec)) return;
  278. if (c == u'[' && !literal) {
  279. if (mode == 1) {
  280. chars.setPos(backup); // backup
  281. setMode = 1;
  282. } else {
  283. // Handle opening '[' delimiter
  284. mode = 1;
  285. patLocal.append(u'[');
  286. chars.getPos(backup); // prepare to backup
  287. c = chars.next(opts, literal, ec);
  288. if (U_FAILURE(ec)) return;
  289. if (c == u'^' && !literal) {
  290. invert = true;
  291. patLocal.append(u'^');
  292. chars.getPos(backup); // prepare to backup
  293. c = chars.next(opts, literal, ec);
  294. if (U_FAILURE(ec)) return;
  295. }
  296. // Fall through to handle special leading '-';
  297. // otherwise restart loop for nested [], \p{}, etc.
  298. if (c == u'-') {
  299. literal = true;
  300. // Fall through to handle literal '-' below
  301. } else {
  302. chars.setPos(backup); // backup
  303. continue;
  304. }
  305. }
  306. } else if (symbols != nullptr) {
  307. const UnicodeFunctor *m = symbols->lookupMatcher(c);
  308. if (m != nullptr) {
  309. const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
  310. if (ms == nullptr) {
  311. ec = U_MALFORMED_SET;
  312. return;
  313. }
  314. // casting away const, but `nested' won't be modified
  315. // (important not to modify stored set)
  316. nested = const_cast<UnicodeSet*>(ms);
  317. setMode = 3;
  318. }
  319. }
  320. }
  321. // -------- Handle a nested set. This either is inline in
  322. // the pattern or represented by a stand-in that has
  323. // previously been parsed and was looked up in the symbol
  324. // table.
  325. if (setMode != 0) {
  326. if (lastItem == 1) {
  327. if (op != 0) {
  328. // syntaxError(chars, "Char expected after operator");
  329. ec = U_MALFORMED_SET;
  330. return;
  331. }
  332. add(lastChar, lastChar);
  333. _appendToPat(patLocal, lastChar, false);
  334. lastItem = 0;
  335. op = 0;
  336. }
  337. if (op == u'-' || op == u'&') {
  338. patLocal.append(op);
  339. }
  340. if (nested == nullptr) {
  341. // lazy allocation
  342. if (!scratch.allocate()) {
  343. ec = U_MEMORY_ALLOCATION_ERROR;
  344. return;
  345. }
  346. nested = scratch.pointer();
  347. }
  348. switch (setMode) {
  349. case 1:
  350. nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
  351. break;
  352. case 2:
  353. chars.skipIgnored(opts);
  354. nested->applyPropertyPattern(chars, patLocal, ec);
  355. if (U_FAILURE(ec)) return;
  356. break;
  357. case 3: // `nested' already parsed
  358. nested->_toPattern(patLocal, false);
  359. break;
  360. }
  361. usePat = true;
  362. if (mode == 0) {
  363. // Entire pattern is a category; leave parse loop
  364. *this = *nested;
  365. mode = 2;
  366. break;
  367. }
  368. switch (op) {
  369. case u'-':
  370. removeAll(*nested);
  371. break;
  372. case u'&':
  373. retainAll(*nested);
  374. break;
  375. case 0:
  376. addAll(*nested);
  377. break;
  378. }
  379. op = 0;
  380. lastItem = 2;
  381. continue;
  382. }
  383. if (mode == 0) {
  384. // syntaxError(chars, "Missing '['");
  385. ec = U_MALFORMED_SET;
  386. return;
  387. }
  388. // -------- Parse special (syntax) characters. If the
  389. // current character is not special, or if it is escaped,
  390. // then fall through and handle it below.
  391. if (!literal) {
  392. switch (c) {
  393. case u']':
  394. if (lastItem == 1) {
  395. add(lastChar, lastChar);
  396. _appendToPat(patLocal, lastChar, false);
  397. }
  398. // Treat final trailing '-' as a literal
  399. if (op == u'-') {
  400. add(op, op);
  401. patLocal.append(op);
  402. } else if (op == u'&') {
  403. // syntaxError(chars, "Trailing '&'");
  404. ec = U_MALFORMED_SET;
  405. return;
  406. }
  407. patLocal.append(u']');
  408. mode = 2;
  409. continue;
  410. case u'-':
  411. if (op == 0) {
  412. if (lastItem != 0) {
  413. op = static_cast<char16_t>(c);
  414. continue;
  415. } else {
  416. // Treat final trailing '-' as a literal
  417. add(c, c);
  418. c = chars.next(opts, literal, ec);
  419. if (U_FAILURE(ec)) return;
  420. if (c == u']' && !literal) {
  421. patLocal.append(u"-]", 2);
  422. mode = 2;
  423. continue;
  424. }
  425. }
  426. }
  427. // syntaxError(chars, "'-' not after char or set");
  428. ec = U_MALFORMED_SET;
  429. return;
  430. case u'&':
  431. if (lastItem == 2 && op == 0) {
  432. op = static_cast<char16_t>(c);
  433. continue;
  434. }
  435. // syntaxError(chars, "'&' not after set");
  436. ec = U_MALFORMED_SET;
  437. return;
  438. case u'^':
  439. // syntaxError(chars, "'^' not after '['");
  440. ec = U_MALFORMED_SET;
  441. return;
  442. case u'{':
  443. if (op != 0) {
  444. // syntaxError(chars, "Missing operand after operator");
  445. ec = U_MALFORMED_SET;
  446. return;
  447. }
  448. if (lastItem == 1) {
  449. add(lastChar, lastChar);
  450. _appendToPat(patLocal, lastChar, false);
  451. }
  452. lastItem = 0;
  453. buf.truncate(0);
  454. {
  455. UBool ok = false;
  456. while (!chars.atEnd()) {
  457. c = chars.next(opts, literal, ec);
  458. if (U_FAILURE(ec)) return;
  459. if (c == u'}' && !literal) {
  460. ok = true;
  461. break;
  462. }
  463. buf.append(c);
  464. }
  465. if (!ok) {
  466. // syntaxError(chars, "Invalid multicharacter string");
  467. ec = U_MALFORMED_SET;
  468. return;
  469. }
  470. }
  471. // We have new string. Add it to set and continue;
  472. // we don't need to drop through to the further
  473. // processing
  474. add(buf);
  475. patLocal.append(u'{');
  476. _appendToPat(patLocal, buf, false);
  477. patLocal.append(u'}');
  478. continue;
  479. case SymbolTable::SYMBOL_REF:
  480. // symbols nosymbols
  481. // [a-$] error error (ambiguous)
  482. // [a$] anchor anchor
  483. // [a-$x] var "x"* literal '$'
  484. // [a-$.] error literal '$'
  485. // *We won't get here in the case of var "x"
  486. {
  487. chars.getPos(backup);
  488. c = chars.next(opts, literal, ec);
  489. if (U_FAILURE(ec)) return;
  490. UBool anchor = (c == u']' && !literal);
  491. if (symbols == nullptr && !anchor) {
  492. c = SymbolTable::SYMBOL_REF;
  493. chars.setPos(backup);
  494. break; // literal '$'
  495. }
  496. if (anchor && op == 0) {
  497. if (lastItem == 1) {
  498. add(lastChar, lastChar);
  499. _appendToPat(patLocal, lastChar, false);
  500. }
  501. add(U_ETHER);
  502. usePat = true;
  503. patLocal.append(static_cast<char16_t>(SymbolTable::SYMBOL_REF));
  504. patLocal.append(u']');
  505. mode = 2;
  506. continue;
  507. }
  508. // syntaxError(chars, "Unquoted '$'");
  509. ec = U_MALFORMED_SET;
  510. return;
  511. }
  512. default:
  513. break;
  514. }
  515. }
  516. // -------- Parse literal characters. This includes both
  517. // escaped chars ("\u4E01") and non-syntax characters
  518. // ("a").
  519. switch (lastItem) {
  520. case 0:
  521. lastItem = 1;
  522. lastChar = c;
  523. break;
  524. case 1:
  525. if (op == u'-') {
  526. if (lastChar >= c) {
  527. // Don't allow redundant (a-a) or empty (b-a) ranges;
  528. // these are most likely typos.
  529. // syntaxError(chars, "Invalid range");
  530. ec = U_MALFORMED_SET;
  531. return;
  532. }
  533. add(lastChar, c);
  534. _appendToPat(patLocal, lastChar, false);
  535. patLocal.append(op);
  536. _appendToPat(patLocal, c, false);
  537. lastItem = 0;
  538. op = 0;
  539. } else {
  540. add(lastChar, lastChar);
  541. _appendToPat(patLocal, lastChar, false);
  542. lastChar = c;
  543. }
  544. break;
  545. case 2:
  546. if (op != 0) {
  547. // syntaxError(chars, "Set expected after operator");
  548. ec = U_MALFORMED_SET;
  549. return;
  550. }
  551. lastChar = c;
  552. lastItem = 1;
  553. break;
  554. }
  555. }
  556. if (mode != 2) {
  557. // syntaxError(chars, "Missing ']'");
  558. ec = U_MALFORMED_SET;
  559. return;
  560. }
  561. chars.skipIgnored(opts);
  562. /**
  563. * Handle global flags (invert, case insensitivity). If this
  564. * pattern should be compiled case-insensitive, then we need
  565. * to close over case BEFORE COMPLEMENTING. This makes
  566. * patterns like /[^abc]/i work.
  567. */
  568. if ((options & USET_CASE_MASK) != 0) {
  569. (this->*caseClosure)(options);
  570. }
  571. if (invert) {
  572. complement().removeAllStrings(); // code point complement
  573. }
  574. // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
  575. // generated pattern.
  576. if (usePat) {
  577. rebuiltPat.append(patLocal);
  578. } else {
  579. _generatePattern(rebuiltPat, false);
  580. }
  581. if (isBogus() && U_SUCCESS(ec)) {
  582. // We likely ran out of memory. AHHH!
  583. ec = U_MEMORY_ALLOCATION_ERROR;
  584. }
  585. }
  586. //----------------------------------------------------------------
  587. // Property set implementation
  588. //----------------------------------------------------------------
  589. namespace {
  590. UBool numericValueFilter(UChar32 ch, void* context) {
  591. return u_getNumericValue(ch) == *static_cast<double*>(context);
  592. }
  593. UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
  594. int32_t value = *static_cast<int32_t*>(context);
  595. return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
  596. }
  597. UBool versionFilter(UChar32 ch, void* context) {
  598. static const UVersionInfo none = { 0, 0, 0, 0 };
  599. UVersionInfo v;
  600. u_charAge(ch, v);
  601. UVersionInfo* version = static_cast<UVersionInfo*>(context);
  602. return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
  603. }
  604. typedef struct {
  605. UProperty prop;
  606. int32_t value;
  607. } IntPropertyContext;
  608. UBool intPropertyFilter(UChar32 ch, void* context) {
  609. IntPropertyContext* c = static_cast<IntPropertyContext*>(context);
  610. return u_getIntPropertyValue(ch, c->prop) == c->value;
  611. }
  612. UBool scriptExtensionsFilter(UChar32 ch, void* context) {
  613. return uscript_hasScript(ch, *static_cast<UScriptCode*>(context));
  614. }
  615. UBool idTypeFilter(UChar32 ch, void* context) {
  616. return u_hasIDType(ch, *static_cast<UIdentifierType*>(context));
  617. }
  618. } // namespace
  619. /**
  620. * Generic filter-based scanning code for UCD property UnicodeSets.
  621. */
  622. void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
  623. void* context,
  624. const UnicodeSet* inclusions,
  625. UErrorCode &status) {
  626. if (U_FAILURE(status)) return;
  627. // Logically, walk through all Unicode characters, noting the start
  628. // and end of each range for which filter.contain(c) is
  629. // true. Add each range to a set.
  630. //
  631. // To improve performance, use an inclusions set which
  632. // encodes information about character ranges that are known
  633. // to have identical properties.
  634. // inclusions contains the first characters of
  635. // same-value ranges for the given property.
  636. clear();
  637. UChar32 startHasProperty = -1;
  638. int32_t limitRange = inclusions->getRangeCount();
  639. for (int j=0; j<limitRange; ++j) {
  640. // get current range
  641. UChar32 start = inclusions->getRangeStart(j);
  642. UChar32 end = inclusions->getRangeEnd(j);
  643. // for all the code points in the range, process
  644. for (UChar32 ch = start; ch <= end; ++ch) {
  645. // only add to this UnicodeSet on inflection points --
  646. // where the hasProperty value changes to false
  647. if ((*filter)(ch, context)) {
  648. if (startHasProperty < 0) {
  649. startHasProperty = ch;
  650. }
  651. } else if (startHasProperty >= 0) {
  652. add(startHasProperty, ch-1);
  653. startHasProperty = -1;
  654. }
  655. }
  656. }
  657. if (startHasProperty >= 0) {
  658. add(startHasProperty, static_cast<UChar32>(0x10FFFF));
  659. }
  660. if (isBogus() && U_SUCCESS(status)) {
  661. // We likely ran out of memory. AHHH!
  662. status = U_MEMORY_ALLOCATION_ERROR;
  663. }
  664. }
  665. namespace {
  666. UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
  667. /* Note: we use ' ' in compiler code page */
  668. int32_t j = 0;
  669. char ch;
  670. --dstCapacity; /* make room for term. zero */
  671. while ((ch = *src++) != 0) {
  672. if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
  673. continue;
  674. }
  675. if (j >= dstCapacity) return false;
  676. dst[j++] = ch;
  677. }
  678. if (j > 0 && dst[j-1] == ' ') --j;
  679. dst[j] = 0;
  680. return true;
  681. }
  682. } // namespace
  683. //----------------------------------------------------------------
  684. // Property set API
  685. //----------------------------------------------------------------
  // Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
  // enclosing member function.
  #define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
      ec = U_ILLEGAL_ARGUMENT_ERROR; \
      return *this; \
  } UPRV_BLOCK_MACRO_END
  690. UnicodeSet&
  691. UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
  692. if (U_FAILURE(ec) || isFrozen()) { return *this; }
  693. if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
  694. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  695. applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
  696. } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
  697. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  698. UScriptCode script = static_cast<UScriptCode>(value);
  699. applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
  700. } else if (prop == UCHAR_IDENTIFIER_TYPE) {
  701. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  702. UIdentifierType idType = static_cast<UIdentifierType>(value);
  703. applyFilter(idTypeFilter, &idType, inclusions, ec);
  704. } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
  705. if (value == 0 || value == 1) {
  706. const USet *set = u_getBinaryPropertySet(prop, &ec);
  707. if (U_FAILURE(ec)) { return *this; }
  708. copyFrom(*UnicodeSet::fromUSet(set), true);
  709. if (value == 0) {
  710. complement().removeAllStrings(); // code point complement
  711. }
  712. } else {
  713. clear();
  714. }
  715. } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
  716. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  717. IntPropertyContext c = {prop, value};
  718. applyFilter(intPropertyFilter, &c, inclusions, ec);
  719. } else {
  720. ec = U_ILLEGAL_ARGUMENT_ERROR;
  721. }
  722. return *this;
  723. }
  724. UnicodeSet&
  725. UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
  726. const UnicodeString& value,
  727. UErrorCode& ec) {
  728. if (U_FAILURE(ec) || isFrozen()) return *this;
  729. // prop and value used to be converted to char * using the default
  730. // converter instead of the invariant conversion.
  731. // This should not be necessary because all Unicode property and value
  732. // names use only invariant characters.
  733. // If there are any variant characters, then we won't find them anyway.
  734. // Checking first avoids assertion failures in the conversion.
  735. if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
  736. !uprv_isInvariantUString(value.getBuffer(), value.length())
  737. ) {
  738. FAIL(ec);
  739. }
  740. CharString pname, vname;
  741. pname.appendInvariantChars(prop, ec);
  742. vname.appendInvariantChars(value, ec);
  743. if (U_FAILURE(ec)) return *this;
  744. UProperty p;
  745. int32_t v;
  746. UBool invert = false;
  747. if (value.length() > 0) {
  748. p = u_getPropertyEnum(pname.data());
  749. if (p == UCHAR_INVALID_CODE) FAIL(ec);
  750. // Treat gc as gcm
  751. if (p == UCHAR_GENERAL_CATEGORY) {
  752. p = UCHAR_GENERAL_CATEGORY_MASK;
  753. }
  754. if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
  755. (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
  756. (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
  757. v = u_getPropertyValueEnum(p, vname.data());
  758. if (v == UCHAR_INVALID_CODE) {
  759. // Handle numeric CCC
  760. if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
  761. p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
  762. p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
  763. char* end;
  764. double val = uprv_strtod(vname.data(), &end);
  765. // Anything between 0 and 255 is valid even if unused.
  766. // Cast double->int only after range check.
  767. // We catch NaN here because comparing it with both 0 and 255 will be false
  768. // (as are all comparisons with NaN).
  769. if (*end != 0 || !(0 <= val && val <= 255) ||
  770. (v = static_cast<int32_t>(val)) != val) {
  771. // non-integral value or outside 0..255, or trailing junk
  772. FAIL(ec);
  773. }
  774. } else {
  775. FAIL(ec);
  776. }
  777. }
  778. }
  779. else {
  780. switch (p) {
  781. case UCHAR_NUMERIC_VALUE:
  782. {
  783. char* end;
  784. double val = uprv_strtod(vname.data(), &end);
  785. if (*end != 0) {
  786. FAIL(ec);
  787. }
  788. applyFilter(numericValueFilter, &val,
  789. CharacterProperties::getInclusionsForProperty(p, ec), ec);
  790. return *this;
  791. }
  792. case UCHAR_NAME:
  793. {
  794. // Must munge name, since u_charFromName() does not do
  795. // 'loose' matching.
  796. char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
  797. if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
  798. UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
  799. if (U_SUCCESS(ec)) {
  800. clear();
  801. add(ch);
  802. return *this;
  803. } else {
  804. FAIL(ec);
  805. }
  806. }
  807. case UCHAR_UNICODE_1_NAME:
  808. // ICU 49 deprecates the Unicode_1_Name property APIs.
  809. FAIL(ec);
  810. case UCHAR_AGE:
  811. {
  812. // Must munge name, since u_versionFromString() does not do
  813. // 'loose' matching.
  814. char buf[128];
  815. if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
  816. UVersionInfo version;
  817. u_versionFromString(version, buf);
  818. applyFilter(versionFilter, &version,
  819. CharacterProperties::getInclusionsForProperty(p, ec), ec);
  820. return *this;
  821. }
  822. case UCHAR_SCRIPT_EXTENSIONS:
  823. v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
  824. if (v == UCHAR_INVALID_CODE) {
  825. FAIL(ec);
  826. }
  827. // fall through to calling applyIntPropertyValue()
  828. break;
  829. case UCHAR_IDENTIFIER_TYPE:
  830. v = u_getPropertyValueEnum(p, vname.data());
  831. if (v == UCHAR_INVALID_CODE) {
  832. FAIL(ec);
  833. }
  834. // fall through to calling applyIntPropertyValue()
  835. break;
  836. default:
  837. // p is a non-binary, non-enumerated property that we
  838. // don't support (yet).
  839. FAIL(ec);
  840. }
  841. }
  842. }
  843. else {
  844. // value is empty. Interpret as General Category, Script, or
  845. // Binary property.
  846. p = UCHAR_GENERAL_CATEGORY_MASK;
  847. v = u_getPropertyValueEnum(p, pname.data());
  848. if (v == UCHAR_INVALID_CODE) {
  849. p = UCHAR_SCRIPT;
  850. v = u_getPropertyValueEnum(p, pname.data());
  851. if (v == UCHAR_INVALID_CODE) {
  852. p = u_getPropertyEnum(pname.data());
  853. if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
  854. v = 1;
  855. } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
  856. set(MIN_VALUE, MAX_VALUE);
  857. return *this;
  858. } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
  859. set(0, 0x7F);
  860. return *this;
  861. } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
  862. // [:Assigned:]=[:^Cn:]
  863. p = UCHAR_GENERAL_CATEGORY_MASK;
  864. v = U_GC_CN_MASK;
  865. invert = true;
  866. } else {
  867. FAIL(ec);
  868. }
  869. }
  870. }
  871. }
  872. applyIntPropertyValue(p, v, ec);
  873. if(invert) {
  874. complement().removeAllStrings(); // code point complement
  875. }
  876. if (isBogus() && U_SUCCESS(ec)) {
  877. // We likely ran out of memory. AHHH!
  878. ec = U_MEMORY_ALLOCATION_ERROR;
  879. }
  880. return *this;
  881. }
  882. //----------------------------------------------------------------
  883. // Property set patterns
  884. //----------------------------------------------------------------
  885. /**
  886. * Return true if the given position, in the given pattern, appears
  887. * to be the start of a property set pattern.
  888. */
  889. UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
  890. int32_t pos) {
  891. // Patterns are at least 5 characters long
  892. if ((pos+5) > pattern.length()) {
  893. return false;
  894. }
  895. // Look for an opening [:, [:^, \p, or \P
  896. return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
  897. }
  898. /**
  899. * Return true if the given iterator appears to point at a
  900. * property pattern. Regardless of the result, return with the
  901. * iterator unchanged.
  902. * @param chars iterator over the pattern characters. Upon return
  903. * it will be unchanged.
  904. * @param iterOpts RuleCharacterIterator options
  905. */
  906. UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
  907. int32_t iterOpts) {
  908. // NOTE: literal will always be false, because we don't parse escapes.
  909. UBool result = false, literal;
  910. UErrorCode ec = U_ZERO_ERROR;
  911. iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
  912. RuleCharacterIterator::Pos pos;
  913. chars.getPos(pos);
  914. UChar32 c = chars.next(iterOpts, literal, ec);
  915. if (c == u'[' || c == u'\\') {
  916. UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
  917. literal, ec);
  918. result = (c == u'[') ? (d == u':') :
  919. (d == u'N' || d == u'p' || d == u'P');
  920. }
  921. chars.setPos(pos);
  922. return result && U_SUCCESS(ec);
  923. }
  924. /**
  925. * Parse the given property pattern at the given parse position.
  926. */
  927. UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
  928. ParsePosition& ppos,
  929. UErrorCode &ec) {
  930. int32_t pos = ppos.getIndex();
  931. UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
  932. UBool isName = false; // true for \N{pat}, o/w false
  933. UBool invert = false;
  934. if (U_FAILURE(ec)) return *this;
  935. // Minimum length is 5 characters, e.g. \p{L}
  936. if ((pos+5) > pattern.length()) {
  937. FAIL(ec);
  938. }
  939. // On entry, ppos should point to one of the following locations:
  940. // Look for an opening [:, [:^, \p, or \P
  941. if (isPOSIXOpen(pattern, pos)) {
  942. posix = true;
  943. pos += 2;
  944. pos = ICU_Utility::skipWhitespace(pattern, pos);
  945. if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
  946. ++pos;
  947. invert = true;
  948. }
  949. } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
  950. char16_t c = pattern.charAt(pos+1);
  951. invert = (c == u'P');
  952. isName = (c == u'N');
  953. pos += 2;
  954. pos = ICU_Utility::skipWhitespace(pattern, pos);
  955. if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
  956. // Syntax error; "\p" or "\P" not followed by "{"
  957. FAIL(ec);
  958. }
  959. } else {
  960. // Open delimiter not seen
  961. FAIL(ec);
  962. }
  963. // Look for the matching close delimiter, either :] or }
  964. int32_t close;
  965. if (posix) {
  966. close = pattern.indexOf(u":]", 2, pos);
  967. } else {
  968. close = pattern.indexOf(u'}', pos);
  969. }
  970. if (close < 0) {
  971. // Syntax error; close delimiter missing
  972. FAIL(ec);
  973. }
  974. // Look for an '=' sign. If this is present, we will parse a
  975. // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
  976. // pattern.
  977. int32_t equals = pattern.indexOf(u'=', pos);
  978. UnicodeString propName, valueName;
  979. if (equals >= 0 && equals < close && !isName) {
  980. // Equals seen; parse medium/long pattern
  981. pattern.extractBetween(pos, equals, propName);
  982. pattern.extractBetween(equals+1, close, valueName);
  983. }
  984. else {
  985. // Handle case where no '=' is seen, and \N{}
  986. pattern.extractBetween(pos, close, propName);
  987. // Handle \N{name}
  988. if (isName) {
  989. // This is a little inefficient since it means we have to
  990. // parse NAME_PROP back to UCHAR_NAME even though we already
  991. // know it's UCHAR_NAME. If we refactor the API to
  992. // support args of (UProperty, char*) then we can remove
  993. // NAME_PROP and make this a little more efficient.
  994. valueName = propName;
  995. propName = NAME_PROP;
  996. }
  997. }
  998. applyPropertyAlias(propName, valueName, ec);
  999. if (U_SUCCESS(ec)) {
  1000. if (invert) {
  1001. complement().removeAllStrings(); // code point complement
  1002. }
  1003. // Move to the limit position after the close delimiter if the
  1004. // parse succeeded.
  1005. ppos.setIndex(close + (posix ? 2 : 1));
  1006. }
  1007. return *this;
  1008. }
  1009. /**
  1010. * Parse a property pattern.
  1011. * @param chars iterator over the pattern characters. Upon return
  1012. * it will be advanced to the first character after the parsed
  1013. * pattern, or the end of the iteration if all characters are
  1014. * parsed.
  1015. * @param rebuiltPat the pattern that was parsed, rebuilt or
  1016. * copied from the input pattern, as appropriate.
  1017. */
  1018. void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
  1019. UnicodeString& rebuiltPat,
  1020. UErrorCode& ec) {
  1021. if (U_FAILURE(ec)) return;
  1022. UnicodeString pattern;
  1023. chars.lookahead(pattern);
  1024. ParsePosition pos(0);
  1025. applyPropertyPattern(pattern, pos, ec);
  1026. if (U_FAILURE(ec)) return;
  1027. if (pos.getIndex() == 0) {
  1028. // syntaxError(chars, "Invalid property pattern");
  1029. ec = U_MALFORMED_SET;
  1030. return;
  1031. }
  1032. chars.jumpahead(pos.getIndex());
  1033. rebuiltPat.append(pattern, 0, pos.getIndex());
  1034. }
  1035. U_NAMESPACE_END