// uniset_props.cpp
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1999-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uniset_props.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2004aug25
  16. * created by: Markus W. Scherer
  17. *
  18. * Character property dependent functions moved here from uniset.cpp
  19. */
  20. #include "unicode/utypes.h"
  21. #include "unicode/uniset.h"
  22. #include "unicode/parsepos.h"
  23. #include "unicode/uchar.h"
  24. #include "unicode/uscript.h"
  25. #include "unicode/symtable.h"
  26. #include "unicode/uset.h"
  27. #include "unicode/locid.h"
  28. #include "unicode/brkiter.h"
  29. #include "uset_imp.h"
  30. #include "ruleiter.h"
  31. #include "cmemory.h"
  32. #include "ucln_cmn.h"
  33. #include "util.h"
  34. #include "uvector.h"
  35. #include "uprops.h"
  36. #include "propname.h"
  37. #include "normalizer2impl.h"
  38. #include "uinvchar.h"
  39. #include "uprops.h"
  40. #include "charstr.h"
  41. #include "cstring.h"
  42. #include "mutex.h"
  43. #include "umutex.h"
  44. #include "uassert.h"
  45. #include "hash.h"
  46. U_NAMESPACE_USE
  47. // Special property set IDs
  48. static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
  49. static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
  50. static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
  51. // Unicode name property alias
  52. #define NAME_PROP "na"
  53. #define NAME_PROP_LENGTH 2
  54. // Cached sets ------------------------------------------------------------- ***
  55. U_CDECL_BEGIN
  56. static UBool U_CALLCONV uset_cleanup();
  57. static UnicodeSet *uni32Singleton;
  58. static icu::UInitOnce uni32InitOnce {};
  59. /**
  60. * Cleanup function for UnicodeSet
  61. */
  62. static UBool U_CALLCONV uset_cleanup() {
  63. delete uni32Singleton;
  64. uni32Singleton = nullptr;
  65. uni32InitOnce.reset();
  66. return true;
  67. }
  68. U_CDECL_END
  69. U_NAMESPACE_BEGIN
  70. namespace {
  71. // Cache some sets for other services -------------------------------------- ***
  72. void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
  73. U_ASSERT(uni32Singleton == nullptr);
  74. uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
  75. if(uni32Singleton==nullptr) {
  76. errorCode=U_MEMORY_ALLOCATION_ERROR;
  77. } else {
  78. uni32Singleton->freeze();
  79. }
  80. ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
  81. }
  82. U_CFUNC UnicodeSet *
  83. uniset_getUnicode32Instance(UErrorCode &errorCode) {
  84. umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
  85. return uni32Singleton;
  86. }
// helper functions for matching of pattern syntax pieces ------------------ ***
// These helpers compare individual code units rather than calling
// UnicodeString::compare() or caseCompare(). That is not only faster, it also
// lets UnicodeSet handle simple patterns when no Unicode properties data is
// available - a situation in which caseCompare() fails.
  92. static inline UBool
  93. isPerlOpen(const UnicodeString &pattern, int32_t pos) {
  94. char16_t c;
  95. return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
  96. }
  97. /*static inline UBool
  98. isPerlClose(const UnicodeString &pattern, int32_t pos) {
  99. return pattern.charAt(pos)==u'}';
  100. }*/
  101. static inline UBool
  102. isNameOpen(const UnicodeString &pattern, int32_t pos) {
  103. return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
  104. }
  105. static inline UBool
  106. isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
  107. return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
  108. }
  109. /*static inline UBool
  110. isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
  111. return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
  112. }*/
  113. // TODO memory debugging provided inside uniset.cpp
  114. // could be made available here but probably obsolete with use of modern
  115. // memory leak checker tools
  116. #define _dbgct(me)
  117. } // namespace
  118. //----------------------------------------------------------------
  119. // Constructors &c
  120. //----------------------------------------------------------------
  121. /**
  122. * Constructs a set from the given pattern, optionally ignoring
  123. * white space. See the class description for the syntax of the
  124. * pattern language.
  125. * @param pattern a string specifying what characters are in the set
  126. */
  127. UnicodeSet::UnicodeSet(const UnicodeString& pattern,
  128. UErrorCode& status) {
  129. applyPattern(pattern, status);
  130. _dbgct(this);
  131. }
  132. //----------------------------------------------------------------
  133. // Public API
  134. //----------------------------------------------------------------
  135. UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
  136. UErrorCode& status) {
  137. // Equivalent to
  138. // return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
  139. // but without dependency on closeOver().
  140. ParsePosition pos(0);
  141. applyPatternIgnoreSpace(pattern, pos, nullptr, status);
  142. if (U_FAILURE(status)) return *this;
  143. int32_t i = pos.getIndex();
  144. // Skip over trailing whitespace
  145. ICU_Utility::skipWhitespace(pattern, i, true);
  146. if (i != pattern.length()) {
  147. status = U_ILLEGAL_ARGUMENT_ERROR;
  148. }
  149. return *this;
  150. }
  151. void
  152. UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
  153. ParsePosition& pos,
  154. const SymbolTable* symbols,
  155. UErrorCode& status) {
  156. if (U_FAILURE(status)) {
  157. return;
  158. }
  159. if (isFrozen()) {
  160. status = U_NO_WRITE_PERMISSION;
  161. return;
  162. }
  163. // Need to build the pattern in a temporary string because
  164. // _applyPattern calls add() etc., which set pat to empty.
  165. UnicodeString rebuiltPat;
  166. RuleCharacterIterator chars(pattern, symbols, pos);
  167. applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
  168. if (U_FAILURE(status)) return;
  169. if (chars.inVariable()) {
  170. // syntaxError(chars, "Extra chars in variable value");
  171. status = U_MALFORMED_SET;
  172. return;
  173. }
  174. setPattern(rebuiltPat);
  175. }
  176. /**
  177. * Return true if the given position, in the given pattern, appears
  178. * to be the start of a UnicodeSet pattern.
  179. */
  180. UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
  181. return ((pos+1) < pattern.length() &&
  182. pattern.charAt(pos) == (char16_t)91/*[*/) ||
  183. resemblesPropertyPattern(pattern, pos);
  184. }
  185. //----------------------------------------------------------------
  186. // Implementation: Pattern parsing
  187. //----------------------------------------------------------------
  188. namespace {
  189. /**
  190. * A small all-inline class to manage a UnicodeSet pointer. Add
  191. * operator->() etc. as needed.
  192. */
  193. class UnicodeSetPointer {
  194. UnicodeSet* p;
  195. public:
  196. inline UnicodeSetPointer() : p(0) {}
  197. inline ~UnicodeSetPointer() { delete p; }
  198. inline UnicodeSet* pointer() { return p; }
  199. inline UBool allocate() {
  200. if (p == 0) {
  201. p = new UnicodeSet();
  202. }
  203. return p != 0;
  204. }
  205. };
  206. constexpr int32_t MAX_DEPTH = 100;
  207. } // namespace
  208. /**
  209. * Parse the pattern from the given RuleCharacterIterator. The
  210. * iterator is advanced over the parsed pattern.
  211. * @param chars iterator over the pattern characters. Upon return
  212. * it will be advanced to the first character after the parsed
  213. * pattern, or the end of the iteration if all characters are
  214. * parsed.
  215. * @param symbols symbol table to use to parse and dereference
  216. * variables, or null if none.
  217. * @param rebuiltPat the pattern that was parsed, rebuilt or
  218. * copied from the input pattern, as appropriate.
  219. * @param options a bit mask of zero or more of the following:
  220. * IGNORE_SPACE, CASE.
  221. */
  222. void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
  223. const SymbolTable* symbols,
  224. UnicodeString& rebuiltPat,
  225. uint32_t options,
  226. UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
  227. int32_t depth,
  228. UErrorCode& ec) {
  229. if (U_FAILURE(ec)) return;
  230. if (depth > MAX_DEPTH) {
  231. ec = U_ILLEGAL_ARGUMENT_ERROR;
  232. return;
  233. }
  234. // Syntax characters: [ ] ^ - & { }
  235. // Recognized special forms for chars, sets: c-c s-s s&s
  236. int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
  237. RuleCharacterIterator::PARSE_ESCAPES;
  238. if ((options & USET_IGNORE_SPACE) != 0) {
  239. opts |= RuleCharacterIterator::SKIP_WHITESPACE;
  240. }
  241. UnicodeString patLocal, buf;
  242. UBool usePat = false;
  243. UnicodeSetPointer scratch;
  244. RuleCharacterIterator::Pos backup;
  245. // mode: 0=before [, 1=between [...], 2=after ]
  246. // lastItem: 0=none, 1=char, 2=set
  247. int8_t lastItem = 0, mode = 0;
  248. UChar32 lastChar = 0;
  249. char16_t op = 0;
  250. UBool invert = false;
  251. clear();
  252. while (mode != 2 && !chars.atEnd()) {
  253. U_ASSERT((lastItem == 0 && op == 0) ||
  254. (lastItem == 1 && (op == 0 || op == u'-')) ||
  255. (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
  256. UChar32 c = 0;
  257. UBool literal = false;
  258. UnicodeSet* nested = 0; // alias - do not delete
  259. // -------- Check for property pattern
  260. // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
  261. int8_t setMode = 0;
  262. if (resemblesPropertyPattern(chars, opts)) {
  263. setMode = 2;
  264. }
  265. // -------- Parse '[' of opening delimiter OR nested set.
  266. // If there is a nested set, use `setMode' to define how
  267. // the set should be parsed. If the '[' is part of the
  268. // opening delimiter for this pattern, parse special
  269. // strings "[", "[^", "[-", and "[^-". Check for stand-in
  270. // characters representing a nested set in the symbol
  271. // table.
  272. else {
  273. // Prepare to backup if necessary
  274. chars.getPos(backup);
  275. c = chars.next(opts, literal, ec);
  276. if (U_FAILURE(ec)) return;
  277. if (c == u'[' && !literal) {
  278. if (mode == 1) {
  279. chars.setPos(backup); // backup
  280. setMode = 1;
  281. } else {
  282. // Handle opening '[' delimiter
  283. mode = 1;
  284. patLocal.append(u'[');
  285. chars.getPos(backup); // prepare to backup
  286. c = chars.next(opts, literal, ec);
  287. if (U_FAILURE(ec)) return;
  288. if (c == u'^' && !literal) {
  289. invert = true;
  290. patLocal.append(u'^');
  291. chars.getPos(backup); // prepare to backup
  292. c = chars.next(opts, literal, ec);
  293. if (U_FAILURE(ec)) return;
  294. }
  295. // Fall through to handle special leading '-';
  296. // otherwise restart loop for nested [], \p{}, etc.
  297. if (c == u'-') {
  298. literal = true;
  299. // Fall through to handle literal '-' below
  300. } else {
  301. chars.setPos(backup); // backup
  302. continue;
  303. }
  304. }
  305. } else if (symbols != 0) {
  306. const UnicodeFunctor *m = symbols->lookupMatcher(c);
  307. if (m != 0) {
  308. const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
  309. if (ms == nullptr) {
  310. ec = U_MALFORMED_SET;
  311. return;
  312. }
  313. // casting away const, but `nested' won't be modified
  314. // (important not to modify stored set)
  315. nested = const_cast<UnicodeSet*>(ms);
  316. setMode = 3;
  317. }
  318. }
  319. }
  320. // -------- Handle a nested set. This either is inline in
  321. // the pattern or represented by a stand-in that has
  322. // previously been parsed and was looked up in the symbol
  323. // table.
  324. if (setMode != 0) {
  325. if (lastItem == 1) {
  326. if (op != 0) {
  327. // syntaxError(chars, "Char expected after operator");
  328. ec = U_MALFORMED_SET;
  329. return;
  330. }
  331. add(lastChar, lastChar);
  332. _appendToPat(patLocal, lastChar, false);
  333. lastItem = 0;
  334. op = 0;
  335. }
  336. if (op == u'-' || op == u'&') {
  337. patLocal.append(op);
  338. }
  339. if (nested == 0) {
  340. // lazy allocation
  341. if (!scratch.allocate()) {
  342. ec = U_MEMORY_ALLOCATION_ERROR;
  343. return;
  344. }
  345. nested = scratch.pointer();
  346. }
  347. switch (setMode) {
  348. case 1:
  349. nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
  350. break;
  351. case 2:
  352. chars.skipIgnored(opts);
  353. nested->applyPropertyPattern(chars, patLocal, ec);
  354. if (U_FAILURE(ec)) return;
  355. break;
  356. case 3: // `nested' already parsed
  357. nested->_toPattern(patLocal, false);
  358. break;
  359. }
  360. usePat = true;
  361. if (mode == 0) {
  362. // Entire pattern is a category; leave parse loop
  363. *this = *nested;
  364. mode = 2;
  365. break;
  366. }
  367. switch (op) {
  368. case u'-':
  369. removeAll(*nested);
  370. break;
  371. case u'&':
  372. retainAll(*nested);
  373. break;
  374. case 0:
  375. addAll(*nested);
  376. break;
  377. }
  378. op = 0;
  379. lastItem = 2;
  380. continue;
  381. }
  382. if (mode == 0) {
  383. // syntaxError(chars, "Missing '['");
  384. ec = U_MALFORMED_SET;
  385. return;
  386. }
  387. // -------- Parse special (syntax) characters. If the
  388. // current character is not special, or if it is escaped,
  389. // then fall through and handle it below.
  390. if (!literal) {
  391. switch (c) {
  392. case u']':
  393. if (lastItem == 1) {
  394. add(lastChar, lastChar);
  395. _appendToPat(patLocal, lastChar, false);
  396. }
  397. // Treat final trailing '-' as a literal
  398. if (op == u'-') {
  399. add(op, op);
  400. patLocal.append(op);
  401. } else if (op == u'&') {
  402. // syntaxError(chars, "Trailing '&'");
  403. ec = U_MALFORMED_SET;
  404. return;
  405. }
  406. patLocal.append(u']');
  407. mode = 2;
  408. continue;
  409. case u'-':
  410. if (op == 0) {
  411. if (lastItem != 0) {
  412. op = (char16_t) c;
  413. continue;
  414. } else {
  415. // Treat final trailing '-' as a literal
  416. add(c, c);
  417. c = chars.next(opts, literal, ec);
  418. if (U_FAILURE(ec)) return;
  419. if (c == u']' && !literal) {
  420. patLocal.append(u"-]", 2);
  421. mode = 2;
  422. continue;
  423. }
  424. }
  425. }
  426. // syntaxError(chars, "'-' not after char or set");
  427. ec = U_MALFORMED_SET;
  428. return;
  429. case u'&':
  430. if (lastItem == 2 && op == 0) {
  431. op = (char16_t) c;
  432. continue;
  433. }
  434. // syntaxError(chars, "'&' not after set");
  435. ec = U_MALFORMED_SET;
  436. return;
  437. case u'^':
  438. // syntaxError(chars, "'^' not after '['");
  439. ec = U_MALFORMED_SET;
  440. return;
  441. case u'{':
  442. if (op != 0) {
  443. // syntaxError(chars, "Missing operand after operator");
  444. ec = U_MALFORMED_SET;
  445. return;
  446. }
  447. if (lastItem == 1) {
  448. add(lastChar, lastChar);
  449. _appendToPat(patLocal, lastChar, false);
  450. }
  451. lastItem = 0;
  452. buf.truncate(0);
  453. {
  454. UBool ok = false;
  455. while (!chars.atEnd()) {
  456. c = chars.next(opts, literal, ec);
  457. if (U_FAILURE(ec)) return;
  458. if (c == u'}' && !literal) {
  459. ok = true;
  460. break;
  461. }
  462. buf.append(c);
  463. }
  464. if (!ok) {
  465. // syntaxError(chars, "Invalid multicharacter string");
  466. ec = U_MALFORMED_SET;
  467. return;
  468. }
  469. }
  470. // We have new string. Add it to set and continue;
  471. // we don't need to drop through to the further
  472. // processing
  473. add(buf);
  474. patLocal.append(u'{');
  475. _appendToPat(patLocal, buf, false);
  476. patLocal.append(u'}');
  477. continue;
  478. case SymbolTable::SYMBOL_REF:
  479. // symbols nosymbols
  480. // [a-$] error error (ambiguous)
  481. // [a$] anchor anchor
  482. // [a-$x] var "x"* literal '$'
  483. // [a-$.] error literal '$'
  484. // *We won't get here in the case of var "x"
  485. {
  486. chars.getPos(backup);
  487. c = chars.next(opts, literal, ec);
  488. if (U_FAILURE(ec)) return;
  489. UBool anchor = (c == u']' && !literal);
  490. if (symbols == 0 && !anchor) {
  491. c = SymbolTable::SYMBOL_REF;
  492. chars.setPos(backup);
  493. break; // literal '$'
  494. }
  495. if (anchor && op == 0) {
  496. if (lastItem == 1) {
  497. add(lastChar, lastChar);
  498. _appendToPat(patLocal, lastChar, false);
  499. }
  500. add(U_ETHER);
  501. usePat = true;
  502. patLocal.append((char16_t) SymbolTable::SYMBOL_REF);
  503. patLocal.append(u']');
  504. mode = 2;
  505. continue;
  506. }
  507. // syntaxError(chars, "Unquoted '$'");
  508. ec = U_MALFORMED_SET;
  509. return;
  510. }
  511. default:
  512. break;
  513. }
  514. }
  515. // -------- Parse literal characters. This includes both
  516. // escaped chars ("\u4E01") and non-syntax characters
  517. // ("a").
  518. switch (lastItem) {
  519. case 0:
  520. lastItem = 1;
  521. lastChar = c;
  522. break;
  523. case 1:
  524. if (op == u'-') {
  525. if (lastChar >= c) {
  526. // Don't allow redundant (a-a) or empty (b-a) ranges;
  527. // these are most likely typos.
  528. // syntaxError(chars, "Invalid range");
  529. ec = U_MALFORMED_SET;
  530. return;
  531. }
  532. add(lastChar, c);
  533. _appendToPat(patLocal, lastChar, false);
  534. patLocal.append(op);
  535. _appendToPat(patLocal, c, false);
  536. lastItem = 0;
  537. op = 0;
  538. } else {
  539. add(lastChar, lastChar);
  540. _appendToPat(patLocal, lastChar, false);
  541. lastChar = c;
  542. }
  543. break;
  544. case 2:
  545. if (op != 0) {
  546. // syntaxError(chars, "Set expected after operator");
  547. ec = U_MALFORMED_SET;
  548. return;
  549. }
  550. lastChar = c;
  551. lastItem = 1;
  552. break;
  553. }
  554. }
  555. if (mode != 2) {
  556. // syntaxError(chars, "Missing ']'");
  557. ec = U_MALFORMED_SET;
  558. return;
  559. }
  560. chars.skipIgnored(opts);
  561. /**
  562. * Handle global flags (invert, case insensitivity). If this
  563. * pattern should be compiled case-insensitive, then we need
  564. * to close over case BEFORE COMPLEMENTING. This makes
  565. * patterns like /[^abc]/i work.
  566. */
  567. if ((options & USET_CASE_MASK) != 0) {
  568. (this->*caseClosure)(options);
  569. }
  570. if (invert) {
  571. complement().removeAllStrings(); // code point complement
  572. }
  573. // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
  574. // generated pattern.
  575. if (usePat) {
  576. rebuiltPat.append(patLocal);
  577. } else {
  578. _generatePattern(rebuiltPat, false);
  579. }
  580. if (isBogus() && U_SUCCESS(ec)) {
  581. // We likely ran out of memory. AHHH!
  582. ec = U_MEMORY_ALLOCATION_ERROR;
  583. }
  584. }
  585. //----------------------------------------------------------------
  586. // Property set implementation
  587. //----------------------------------------------------------------
  588. namespace {
  589. static UBool numericValueFilter(UChar32 ch, void* context) {
  590. return u_getNumericValue(ch) == *(double*)context;
  591. }
  592. static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
  593. int32_t value = *(int32_t*)context;
  594. return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
  595. }
  596. static UBool versionFilter(UChar32 ch, void* context) {
  597. static const UVersionInfo none = { 0, 0, 0, 0 };
  598. UVersionInfo v;
  599. u_charAge(ch, v);
  600. UVersionInfo* version = (UVersionInfo*)context;
  601. return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
  602. }
  603. typedef struct {
  604. UProperty prop;
  605. int32_t value;
  606. } IntPropertyContext;
  607. static UBool intPropertyFilter(UChar32 ch, void* context) {
  608. IntPropertyContext* c = (IntPropertyContext*)context;
  609. return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
  610. }
  611. static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
  612. return uscript_hasScript(ch, *(UScriptCode*)context);
  613. }
  614. } // namespace
  615. /**
  616. * Generic filter-based scanning code for UCD property UnicodeSets.
  617. */
  618. void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
  619. void* context,
  620. const UnicodeSet* inclusions,
  621. UErrorCode &status) {
  622. if (U_FAILURE(status)) return;
  623. // Logically, walk through all Unicode characters, noting the start
  624. // and end of each range for which filter.contain(c) is
  625. // true. Add each range to a set.
  626. //
  627. // To improve performance, use an inclusions set which
  628. // encodes information about character ranges that are known
  629. // to have identical properties.
  630. // inclusions contains the first characters of
  631. // same-value ranges for the given property.
  632. clear();
  633. UChar32 startHasProperty = -1;
  634. int32_t limitRange = inclusions->getRangeCount();
  635. for (int j=0; j<limitRange; ++j) {
  636. // get current range
  637. UChar32 start = inclusions->getRangeStart(j);
  638. UChar32 end = inclusions->getRangeEnd(j);
  639. // for all the code points in the range, process
  640. for (UChar32 ch = start; ch <= end; ++ch) {
  641. // only add to this UnicodeSet on inflection points --
  642. // where the hasProperty value changes to false
  643. if ((*filter)(ch, context)) {
  644. if (startHasProperty < 0) {
  645. startHasProperty = ch;
  646. }
  647. } else if (startHasProperty >= 0) {
  648. add(startHasProperty, ch-1);
  649. startHasProperty = -1;
  650. }
  651. }
  652. }
  653. if (startHasProperty >= 0) {
  654. add((UChar32)startHasProperty, (UChar32)0x10FFFF);
  655. }
  656. if (isBogus() && U_SUCCESS(status)) {
  657. // We likely ran out of memory. AHHH!
  658. status = U_MEMORY_ALLOCATION_ERROR;
  659. }
  660. }
  661. namespace {
  662. static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
  663. /* Note: we use ' ' in compiler code page */
  664. int32_t j = 0;
  665. char ch;
  666. --dstCapacity; /* make room for term. zero */
  667. while ((ch = *src++) != 0) {
  668. if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
  669. continue;
  670. }
  671. if (j >= dstCapacity) return false;
  672. dst[j++] = ch;
  673. }
  674. if (j > 0 && dst[j-1] == ' ') --j;
  675. dst[j] = 0;
  676. return true;
  677. }
  678. } // namespace
  679. //----------------------------------------------------------------
  680. // Property set API
  681. //----------------------------------------------------------------
  682. #define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
  683. ec=U_ILLEGAL_ARGUMENT_ERROR; \
  684. return *this; \
  685. } UPRV_BLOCK_MACRO_END
  686. UnicodeSet&
  687. UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
  688. if (U_FAILURE(ec) || isFrozen()) { return *this; }
  689. if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
  690. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  691. applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
  692. } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
  693. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  694. UScriptCode script = (UScriptCode)value;
  695. applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
  696. } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
  697. if (value == 0 || value == 1) {
  698. const USet *set = u_getBinaryPropertySet(prop, &ec);
  699. if (U_FAILURE(ec)) { return *this; }
  700. copyFrom(*UnicodeSet::fromUSet(set), true);
  701. if (value == 0) {
  702. complement().removeAllStrings(); // code point complement
  703. }
  704. } else {
  705. clear();
  706. }
  707. } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
  708. const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
  709. IntPropertyContext c = {prop, value};
  710. applyFilter(intPropertyFilter, &c, inclusions, ec);
  711. } else {
  712. ec = U_ILLEGAL_ARGUMENT_ERROR;
  713. }
  714. return *this;
  715. }
  716. UnicodeSet&
  717. UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
  718. const UnicodeString& value,
  719. UErrorCode& ec) {
  720. if (U_FAILURE(ec) || isFrozen()) return *this;
  721. // prop and value used to be converted to char * using the default
  722. // converter instead of the invariant conversion.
  723. // This should not be necessary because all Unicode property and value
  724. // names use only invariant characters.
  725. // If there are any variant characters, then we won't find them anyway.
  726. // Checking first avoids assertion failures in the conversion.
  727. if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
  728. !uprv_isInvariantUString(value.getBuffer(), value.length())
  729. ) {
  730. FAIL(ec);
  731. }
  732. CharString pname, vname;
  733. pname.appendInvariantChars(prop, ec);
  734. vname.appendInvariantChars(value, ec);
  735. if (U_FAILURE(ec)) return *this;
  736. UProperty p;
  737. int32_t v;
  738. UBool invert = false;
  739. if (value.length() > 0) {
  740. p = u_getPropertyEnum(pname.data());
  741. if (p == UCHAR_INVALID_CODE) FAIL(ec);
  742. // Treat gc as gcm
  743. if (p == UCHAR_GENERAL_CATEGORY) {
  744. p = UCHAR_GENERAL_CATEGORY_MASK;
  745. }
  746. if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
  747. (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
  748. (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
  749. v = u_getPropertyValueEnum(p, vname.data());
  750. if (v == UCHAR_INVALID_CODE) {
  751. // Handle numeric CCC
  752. if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
  753. p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
  754. p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
  755. char* end;
  756. double val = uprv_strtod(vname.data(), &end);
  757. // Anything between 0 and 255 is valid even if unused.
  758. // Cast double->int only after range check.
  759. // We catch NaN here because comparing it with both 0 and 255 will be false
  760. // (as are all comparisons with NaN).
  761. if (*end != 0 || !(0 <= val && val <= 255) ||
  762. (v = (int32_t)val) != val) {
  763. // non-integral value or outside 0..255, or trailing junk
  764. FAIL(ec);
  765. }
  766. } else {
  767. FAIL(ec);
  768. }
  769. }
  770. }
  771. else {
  772. switch (p) {
  773. case UCHAR_NUMERIC_VALUE:
  774. {
  775. char* end;
  776. double val = uprv_strtod(vname.data(), &end);
  777. if (*end != 0) {
  778. FAIL(ec);
  779. }
  780. applyFilter(numericValueFilter, &val,
  781. CharacterProperties::getInclusionsForProperty(p, ec), ec);
  782. return *this;
  783. }
  784. case UCHAR_NAME:
  785. {
  786. // Must munge name, since u_charFromName() does not do
  787. // 'loose' matching.
  788. char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
  789. if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
  790. UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
  791. if (U_SUCCESS(ec)) {
  792. clear();
  793. add(ch);
  794. return *this;
  795. } else {
  796. FAIL(ec);
  797. }
  798. }
  799. case UCHAR_UNICODE_1_NAME:
  800. // ICU 49 deprecates the Unicode_1_Name property APIs.
  801. FAIL(ec);
  802. case UCHAR_AGE:
  803. {
  804. // Must munge name, since u_versionFromString() does not do
  805. // 'loose' matching.
  806. char buf[128];
  807. if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
  808. UVersionInfo version;
  809. u_versionFromString(version, buf);
  810. applyFilter(versionFilter, &version,
  811. CharacterProperties::getInclusionsForProperty(p, ec), ec);
  812. return *this;
  813. }
  814. case UCHAR_SCRIPT_EXTENSIONS:
  815. v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
  816. if (v == UCHAR_INVALID_CODE) {
  817. FAIL(ec);
  818. }
  819. // fall through to calling applyIntPropertyValue()
  820. break;
  821. default:
  822. // p is a non-binary, non-enumerated property that we
  823. // don't support (yet).
  824. FAIL(ec);
  825. }
  826. }
  827. }
  828. else {
  829. // value is empty. Interpret as General Category, Script, or
  830. // Binary property.
  831. p = UCHAR_GENERAL_CATEGORY_MASK;
  832. v = u_getPropertyValueEnum(p, pname.data());
  833. if (v == UCHAR_INVALID_CODE) {
  834. p = UCHAR_SCRIPT;
  835. v = u_getPropertyValueEnum(p, pname.data());
  836. if (v == UCHAR_INVALID_CODE) {
  837. p = u_getPropertyEnum(pname.data());
  838. if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
  839. v = 1;
  840. } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
  841. set(MIN_VALUE, MAX_VALUE);
  842. return *this;
  843. } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
  844. set(0, 0x7F);
  845. return *this;
  846. } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
  847. // [:Assigned:]=[:^Cn:]
  848. p = UCHAR_GENERAL_CATEGORY_MASK;
  849. v = U_GC_CN_MASK;
  850. invert = true;
  851. } else {
  852. FAIL(ec);
  853. }
  854. }
  855. }
  856. }
  857. applyIntPropertyValue(p, v, ec);
  858. if(invert) {
  859. complement().removeAllStrings(); // code point complement
  860. }
  861. if (isBogus() && U_SUCCESS(ec)) {
  862. // We likely ran out of memory. AHHH!
  863. ec = U_MEMORY_ALLOCATION_ERROR;
  864. }
  865. return *this;
  866. }
  867. //----------------------------------------------------------------
  868. // Property set patterns
  869. //----------------------------------------------------------------
  870. /**
  871. * Return true if the given position, in the given pattern, appears
  872. * to be the start of a property set pattern.
  873. */
  874. UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
  875. int32_t pos) {
  876. // Patterns are at least 5 characters long
  877. if ((pos+5) > pattern.length()) {
  878. return false;
  879. }
  880. // Look for an opening [:, [:^, \p, or \P
  881. return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
  882. }
  883. /**
  884. * Return true if the given iterator appears to point at a
  885. * property pattern. Regardless of the result, return with the
  886. * iterator unchanged.
  887. * @param chars iterator over the pattern characters. Upon return
  888. * it will be unchanged.
  889. * @param iterOpts RuleCharacterIterator options
  890. */
  891. UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
  892. int32_t iterOpts) {
  893. // NOTE: literal will always be false, because we don't parse escapes.
  894. UBool result = false, literal;
  895. UErrorCode ec = U_ZERO_ERROR;
  896. iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
  897. RuleCharacterIterator::Pos pos;
  898. chars.getPos(pos);
  899. UChar32 c = chars.next(iterOpts, literal, ec);
  900. if (c == u'[' || c == u'\\') {
  901. UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
  902. literal, ec);
  903. result = (c == u'[') ? (d == u':') :
  904. (d == u'N' || d == u'p' || d == u'P');
  905. }
  906. chars.setPos(pos);
  907. return result && U_SUCCESS(ec);
  908. }
  909. /**
  910. * Parse the given property pattern at the given parse position.
  911. */
  912. UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
  913. ParsePosition& ppos,
  914. UErrorCode &ec) {
  915. int32_t pos = ppos.getIndex();
  916. UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
  917. UBool isName = false; // true for \N{pat}, o/w false
  918. UBool invert = false;
  919. if (U_FAILURE(ec)) return *this;
  920. // Minimum length is 5 characters, e.g. \p{L}
  921. if ((pos+5) > pattern.length()) {
  922. FAIL(ec);
  923. }
  924. // On entry, ppos should point to one of the following locations:
  925. // Look for an opening [:, [:^, \p, or \P
  926. if (isPOSIXOpen(pattern, pos)) {
  927. posix = true;
  928. pos += 2;
  929. pos = ICU_Utility::skipWhitespace(pattern, pos);
  930. if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
  931. ++pos;
  932. invert = true;
  933. }
  934. } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
  935. char16_t c = pattern.charAt(pos+1);
  936. invert = (c == u'P');
  937. isName = (c == u'N');
  938. pos += 2;
  939. pos = ICU_Utility::skipWhitespace(pattern, pos);
  940. if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
  941. // Syntax error; "\p" or "\P" not followed by "{"
  942. FAIL(ec);
  943. }
  944. } else {
  945. // Open delimiter not seen
  946. FAIL(ec);
  947. }
  948. // Look for the matching close delimiter, either :] or }
  949. int32_t close;
  950. if (posix) {
  951. close = pattern.indexOf(u":]", 2, pos);
  952. } else {
  953. close = pattern.indexOf(u'}', pos);
  954. }
  955. if (close < 0) {
  956. // Syntax error; close delimiter missing
  957. FAIL(ec);
  958. }
  959. // Look for an '=' sign. If this is present, we will parse a
  960. // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
  961. // pattern.
  962. int32_t equals = pattern.indexOf(u'=', pos);
  963. UnicodeString propName, valueName;
  964. if (equals >= 0 && equals < close && !isName) {
  965. // Equals seen; parse medium/long pattern
  966. pattern.extractBetween(pos, equals, propName);
  967. pattern.extractBetween(equals+1, close, valueName);
  968. }
  969. else {
  970. // Handle case where no '=' is seen, and \N{}
  971. pattern.extractBetween(pos, close, propName);
  972. // Handle \N{name}
  973. if (isName) {
  974. // This is a little inefficient since it means we have to
  975. // parse NAME_PROP back to UCHAR_NAME even though we already
  976. // know it's UCHAR_NAME. If we refactor the API to
  977. // support args of (UProperty, char*) then we can remove
  978. // NAME_PROP and make this a little more efficient.
  979. valueName = propName;
  980. propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
  981. }
  982. }
  983. applyPropertyAlias(propName, valueName, ec);
  984. if (U_SUCCESS(ec)) {
  985. if (invert) {
  986. complement().removeAllStrings(); // code point complement
  987. }
  988. // Move to the limit position after the close delimiter if the
  989. // parse succeeded.
  990. ppos.setIndex(close + (posix ? 2 : 1));
  991. }
  992. return *this;
  993. }
  994. /**
  995. * Parse a property pattern.
  996. * @param chars iterator over the pattern characters. Upon return
  997. * it will be advanced to the first character after the parsed
  998. * pattern, or the end of the iteration if all characters are
  999. * parsed.
  1000. * @param rebuiltPat the pattern that was parsed, rebuilt or
  1001. * copied from the input pattern, as appropriate.
  1002. */
  1003. void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
  1004. UnicodeString& rebuiltPat,
  1005. UErrorCode& ec) {
  1006. if (U_FAILURE(ec)) return;
  1007. UnicodeString pattern;
  1008. chars.lookahead(pattern);
  1009. ParsePosition pos(0);
  1010. applyPropertyPattern(pattern, pos, ec);
  1011. if (U_FAILURE(ec)) return;
  1012. if (pos.getIndex() == 0) {
  1013. // syntaxError(chars, "Invalid property pattern");
  1014. ec = U_MALFORMED_SET;
  1015. return;
  1016. }
  1017. chars.jumpahead(pos.getIndex());
  1018. rebuiltPat.append(pattern, 0, pos.getIndex());
  1019. }
  1020. U_NAMESPACE_END