ucharstriebuilder.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2010-2012, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: ucharstriebuilder.h
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2010nov14
  14. * created by: Markus W. Scherer
  15. */
  16. #include "unicode/utypes.h"
  17. #include "unicode/ucharstrie.h"
  18. #include "unicode/ucharstriebuilder.h"
  19. #include "unicode/unistr.h"
  20. #include "unicode/ustring.h"
  21. #include "cmemory.h"
  22. #include "uarrsort.h"
  23. #include "uassert.h"
  24. #include "uhash.h"
  25. #include "ustr_imp.h"
  26. U_NAMESPACE_BEGIN
  27. /*
  28. * Note: This builder implementation stores (string, value) pairs with full copies
  29. * of the 16-bit-unit sequences, until the UCharsTrie is built.
  30. * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
  31. */
  32. class UCharsTrieElement : public UMemory {
  33. public:
  34. // Use compiler's default constructor, initializes nothing.
  35. void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
  36. UnicodeString getString(const UnicodeString &strings) const {
  37. int32_t length=strings[stringOffset];
  38. return strings.tempSubString(stringOffset+1, length);
  39. }
  40. int32_t getStringLength(const UnicodeString &strings) const {
  41. return strings[stringOffset];
  42. }
  43. char16_t charAt(int32_t index, const UnicodeString &strings) const {
  44. return strings[stringOffset+1+index];
  45. }
  46. int32_t getValue() const { return value; }
  47. int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
  48. private:
  49. // The first strings unit contains the string length.
  50. // (Compared with a stringLength field here, this saves 2 bytes per string.)
  51. int32_t stringOffset;
  52. int32_t value;
  53. };
  54. void
  55. UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
  56. UnicodeString &strings, UErrorCode &errorCode) {
  57. if(U_FAILURE(errorCode)) {
  58. return;
  59. }
  60. int32_t length=s.length();
  61. if(length>0xffff) {
  62. // Too long: We store the length in 1 unit.
  63. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  64. return;
  65. }
  66. stringOffset=strings.length();
  67. strings.append(static_cast<char16_t>(length));
  68. value=val;
  69. strings.append(s);
  70. }
  71. int32_t
  72. UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
  73. return getString(strings).compare(other.getString(strings));
  74. }
  75. UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
  76. : elements(nullptr), elementsCapacity(0), elementsLength(0),
  77. uchars(nullptr), ucharsCapacity(0), ucharsLength(0) {}
  78. UCharsTrieBuilder::~UCharsTrieBuilder() {
  79. delete[] elements;
  80. uprv_free(uchars);
  81. }
  82. UCharsTrieBuilder &
  83. UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
  84. if(U_FAILURE(errorCode)) {
  85. return *this;
  86. }
  87. if(ucharsLength>0) {
  88. // Cannot add elements after building.
  89. errorCode=U_NO_WRITE_PERMISSION;
  90. return *this;
  91. }
  92. if(elementsLength==elementsCapacity) {
  93. int32_t newCapacity;
  94. if(elementsCapacity==0) {
  95. newCapacity=1024;
  96. } else {
  97. newCapacity=4*elementsCapacity;
  98. }
  99. UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
  100. if(newElements==nullptr) {
  101. errorCode=U_MEMORY_ALLOCATION_ERROR;
  102. return *this;
  103. }
  104. if(elementsLength>0) {
  105. uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
  106. }
  107. delete[] elements;
  108. elements=newElements;
  109. elementsCapacity=newCapacity;
  110. }
  111. elements[elementsLength++].setTo(s, value, strings, errorCode);
  112. if(U_SUCCESS(errorCode) && strings.isBogus()) {
  113. errorCode=U_MEMORY_ALLOCATION_ERROR;
  114. }
  115. return *this;
  116. }
  117. U_CDECL_BEGIN
  118. static int32_t U_CALLCONV
  119. compareElementStrings(const void *context, const void *left, const void *right) {
  120. const UnicodeString *strings=static_cast<const UnicodeString *>(context);
  121. const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
  122. const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
  123. return leftElement->compareStringTo(*rightElement, *strings);
  124. }
  125. U_CDECL_END
  126. UCharsTrie *
  127. UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
  128. buildUChars(buildOption, errorCode);
  129. UCharsTrie *newTrie=nullptr;
  130. if(U_SUCCESS(errorCode)) {
  131. newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
  132. if(newTrie==nullptr) {
  133. errorCode=U_MEMORY_ALLOCATION_ERROR;
  134. } else {
  135. uchars=nullptr; // The new trie now owns the array.
  136. ucharsCapacity=0;
  137. }
  138. }
  139. return newTrie;
  140. }
  141. UnicodeString &
  142. UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
  143. UErrorCode &errorCode) {
  144. buildUChars(buildOption, errorCode);
  145. if(U_SUCCESS(errorCode)) {
  146. result.setTo(false, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
  147. }
  148. return result;
  149. }
  150. void
  151. UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
  152. if(U_FAILURE(errorCode)) {
  153. return;
  154. }
  155. if(uchars!=nullptr && ucharsLength>0) {
  156. // Already built.
  157. return;
  158. }
  159. if(ucharsLength==0) {
  160. if(elementsLength==0) {
  161. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  162. return;
  163. }
  164. if(strings.isBogus()) {
  165. errorCode=U_MEMORY_ALLOCATION_ERROR;
  166. return;
  167. }
  168. uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(UCharsTrieElement)),
  169. compareElementStrings, &strings,
  170. false, // need not be a stable sort
  171. &errorCode);
  172. if(U_FAILURE(errorCode)) {
  173. return;
  174. }
  175. // Duplicate strings are not allowed.
  176. UnicodeString prev=elements[0].getString(strings);
  177. for(int32_t i=1; i<elementsLength; ++i) {
  178. UnicodeString current=elements[i].getString(strings);
  179. if(prev==current) {
  180. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  181. return;
  182. }
  183. prev.fastCopyFrom(current);
  184. }
  185. }
  186. // Create and char16_t-serialize the trie for the elements.
  187. ucharsLength=0;
  188. int32_t capacity=strings.length();
  189. if(capacity<1024) {
  190. capacity=1024;
  191. }
  192. if(ucharsCapacity<capacity) {
  193. uprv_free(uchars);
  194. uchars=static_cast<char16_t *>(uprv_malloc(capacity*2));
  195. if(uchars==nullptr) {
  196. errorCode=U_MEMORY_ALLOCATION_ERROR;
  197. ucharsCapacity=0;
  198. return;
  199. }
  200. ucharsCapacity=capacity;
  201. }
  202. StringTrieBuilder::build(buildOption, elementsLength, errorCode);
  203. if(uchars==nullptr) {
  204. errorCode=U_MEMORY_ALLOCATION_ERROR;
  205. }
  206. }
  207. int32_t
  208. UCharsTrieBuilder::getElementStringLength(int32_t i) const {
  209. return elements[i].getStringLength(strings);
  210. }
  211. char16_t
  212. UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
  213. return elements[i].charAt(unitIndex, strings);
  214. }
  215. int32_t
  216. UCharsTrieBuilder::getElementValue(int32_t i) const {
  217. return elements[i].getValue();
  218. }
  219. int32_t
  220. UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
  221. const UCharsTrieElement &firstElement=elements[first];
  222. const UCharsTrieElement &lastElement=elements[last];
  223. int32_t minStringLength=firstElement.getStringLength(strings);
  224. while(++unitIndex<minStringLength &&
  225. firstElement.charAt(unitIndex, strings)==
  226. lastElement.charAt(unitIndex, strings)) {}
  227. return unitIndex;
  228. }
  229. int32_t
  230. UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
  231. int32_t length=0; // Number of different units at unitIndex.
  232. int32_t i=start;
  233. do {
  234. char16_t unit=elements[i++].charAt(unitIndex, strings);
  235. while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
  236. ++i;
  237. }
  238. ++length;
  239. } while(i<limit);
  240. return length;
  241. }
  242. int32_t
  243. UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
  244. do {
  245. char16_t unit=elements[i++].charAt(unitIndex, strings);
  246. while(unit==elements[i].charAt(unitIndex, strings)) {
  247. ++i;
  248. }
  249. } while(--count>0);
  250. return i;
  251. }
  252. int32_t
  253. UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const {
  254. while(unit==elements[i].charAt(unitIndex, strings)) {
  255. ++i;
  256. }
  257. return i;
  258. }
  259. UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const char16_t *units, int32_t len, Node *nextNode)
  260. : LinearMatchNode(len, nextNode), s(units) {
  261. hash=hash*37u+ustr_hashUCharsN(units, len);
  262. }
  263. bool
  264. UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
  265. if(this==&other) {
  266. return true;
  267. }
  268. if(!LinearMatchNode::operator==(other)) {
  269. return false;
  270. }
  271. const UCTLinearMatchNode &o=static_cast<const UCTLinearMatchNode &>(other);
  272. return 0==u_memcmp(s, o.s, length);
  273. }
  274. void
  275. UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
  276. UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
  277. next->write(builder);
  278. b.write(s, length);
  279. offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
  280. }
  281. StringTrieBuilder::Node *
  282. UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
  283. Node *nextNode) const {
  284. return new UCTLinearMatchNode(
  285. elements[i].getString(strings).getBuffer()+unitIndex,
  286. length,
  287. nextNode);
  288. }
  289. UBool
  290. UCharsTrieBuilder::ensureCapacity(int32_t length) {
  291. if(uchars==nullptr) {
  292. return false; // previous memory allocation had failed
  293. }
  294. if(length>ucharsCapacity) {
  295. int32_t newCapacity=ucharsCapacity;
  296. do {
  297. newCapacity*=2;
  298. } while(newCapacity<=length);
  299. char16_t *newUChars=static_cast<char16_t *>(uprv_malloc(newCapacity*2));
  300. if(newUChars==nullptr) {
  301. // unable to allocate memory
  302. uprv_free(uchars);
  303. uchars=nullptr;
  304. ucharsCapacity=0;
  305. return false;
  306. }
  307. u_memcpy(newUChars+(newCapacity-ucharsLength),
  308. uchars+(ucharsCapacity-ucharsLength), ucharsLength);
  309. uprv_free(uchars);
  310. uchars=newUChars;
  311. ucharsCapacity=newCapacity;
  312. }
  313. return true;
  314. }
  315. int32_t
  316. UCharsTrieBuilder::write(int32_t unit) {
  317. int32_t newLength=ucharsLength+1;
  318. if(ensureCapacity(newLength)) {
  319. ucharsLength=newLength;
  320. uchars[ucharsCapacity - ucharsLength] = static_cast<char16_t>(unit);
  321. }
  322. return ucharsLength;
  323. }
  324. int32_t
  325. UCharsTrieBuilder::write(const char16_t *s, int32_t length) {
  326. int32_t newLength=ucharsLength+length;
  327. if(ensureCapacity(newLength)) {
  328. ucharsLength=newLength;
  329. u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
  330. }
  331. return ucharsLength;
  332. }
  333. int32_t
  334. UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
  335. return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
  336. }
  337. int32_t
  338. UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
  339. if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
  340. return write(i|(isFinal<<15));
  341. }
  342. char16_t intUnits[3];
  343. int32_t length;
  344. if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
  345. intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitValueLead);
  346. intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(i) >> 16);
  347. intUnits[2] = static_cast<char16_t>(i);
  348. length=3;
  349. // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
  350. // intUnits[0]=(char16_t)(i);
  351. // length=1;
  352. } else {
  353. intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitValueLead + (i >> 16));
  354. intUnits[1] = static_cast<char16_t>(i);
  355. length=2;
  356. }
  357. intUnits[0] = static_cast<char16_t>(intUnits[0] | (isFinal << 15));
  358. return write(intUnits, length);
  359. }
  360. int32_t
  361. UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
  362. if(!hasValue) {
  363. return write(node);
  364. }
  365. char16_t intUnits[3];
  366. int32_t length;
  367. if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
  368. intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitNodeValueLead);
  369. intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(value) >> 16);
  370. intUnits[2] = static_cast<char16_t>(value);
  371. length=3;
  372. } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
  373. intUnits[0] = static_cast<char16_t>((value + 1) << 6);
  374. length=1;
  375. } else {
  376. intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitNodeValueLead + ((value >> 10) & 0x7fc0));
  377. intUnits[1] = static_cast<char16_t>(value);
  378. length=2;
  379. }
  380. intUnits[0] |= static_cast<char16_t>(node);
  381. return write(intUnits, length);
  382. }
  383. int32_t
  384. UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
  385. int32_t i=ucharsLength-jumpTarget;
  386. U_ASSERT(i>=0);
  387. if(i<=UCharsTrie::kMaxOneUnitDelta) {
  388. return write(i);
  389. }
  390. char16_t intUnits[3];
  391. int32_t length;
  392. if(i<=UCharsTrie::kMaxTwoUnitDelta) {
  393. intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitDeltaLead + (i >> 16));
  394. length=1;
  395. } else {
  396. intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitDeltaLead);
  397. intUnits[1] = static_cast<char16_t>(i >> 16);
  398. length=2;
  399. }
  400. intUnits[length++] = static_cast<char16_t>(i);
  401. return write(intUnits, length);
  402. }
  403. U_NAMESPACE_END