bytestriebuilder.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2010-2012, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: bytestriebuilder.cpp
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2010sep25
  14. * created by: Markus W. Scherer
  15. */
  16. #include "unicode/utypes.h"
  17. #include "unicode/bytestrie.h"
  18. #include "unicode/bytestriebuilder.h"
  19. #include "unicode/stringpiece.h"
  20. #include "charstr.h"
  21. #include "cmemory.h"
  22. #include "uhash.h"
  23. #include "uarrsort.h"
  24. #include "uassert.h"
  25. #include "ustr_imp.h"
  26. U_NAMESPACE_BEGIN
  27. /*
  28. * Note: This builder implementation stores (bytes, value) pairs with full copies
  29. * of the byte sequences, until the BytesTrie is built.
  30. * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
  31. */
  32. class BytesTrieElement : public UMemory {
  33. public:
  34. // Use compiler's default constructor, initializes nothing.
  35. void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
  36. StringPiece getString(const CharString &strings) const {
  37. int32_t offset=stringOffset;
  38. int32_t length;
  39. if(offset>=0) {
  40. length = static_cast<uint8_t>(strings[offset++]);
  41. } else {
  42. offset=~offset;
  43. length = (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);
  44. offset+=2;
  45. }
  46. return StringPiece(strings.data()+offset, length);
  47. }
  48. int32_t getStringLength(const CharString &strings) const {
  49. int32_t offset=stringOffset;
  50. if(offset>=0) {
  51. return static_cast<uint8_t>(strings[offset]);
  52. } else {
  53. offset=~offset;
  54. return (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);
  55. }
  56. }
  57. char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
  58. int32_t getValue() const { return value; }
  59. int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
  60. private:
  61. const char *data(const CharString &strings) const {
  62. int32_t offset=stringOffset;
  63. if(offset>=0) {
  64. ++offset;
  65. } else {
  66. offset=~offset+2;
  67. }
  68. return strings.data()+offset;
  69. }
  70. // If the stringOffset is non-negative, then the first strings byte contains
  71. // the string length.
  72. // If the stringOffset is negative, then the first two strings bytes contain
  73. // the string length (big-endian), and the offset needs to be bit-inverted.
  74. // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
  75. int32_t stringOffset;
  76. int32_t value;
  77. };
  78. void
  79. BytesTrieElement::setTo(StringPiece s, int32_t val,
  80. CharString &strings, UErrorCode &errorCode) {
  81. if(U_FAILURE(errorCode)) {
  82. return;
  83. }
  84. int32_t length=s.length();
  85. if(length>0xffff) {
  86. // Too long: We store the length in 1 or 2 bytes.
  87. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  88. return;
  89. }
  90. int32_t offset=strings.length();
  91. if(length>0xff) {
  92. offset=~offset;
  93. strings.append(static_cast<char>(length >> 8), errorCode);
  94. }
  95. strings.append(static_cast<char>(length), errorCode);
  96. stringOffset=offset;
  97. value=val;
  98. strings.append(s, errorCode);
  99. }
  100. int32_t
  101. BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
  102. // TODO: add StringPiece::compare(), see ticket #8187
  103. StringPiece thisString=getString(strings);
  104. StringPiece otherString=other.getString(strings);
  105. int32_t lengthDiff=thisString.length()-otherString.length();
  106. int32_t commonLength;
  107. if(lengthDiff<=0) {
  108. commonLength=thisString.length();
  109. } else {
  110. commonLength=otherString.length();
  111. }
  112. int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
  113. return diff!=0 ? diff : lengthDiff;
  114. }
  115. BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
  116. : strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),
  117. bytes(nullptr), bytesCapacity(0), bytesLength(0) {
  118. if(U_FAILURE(errorCode)) {
  119. return;
  120. }
  121. strings=new CharString();
  122. if(strings==nullptr) {
  123. errorCode=U_MEMORY_ALLOCATION_ERROR;
  124. }
  125. }
  126. BytesTrieBuilder::~BytesTrieBuilder() {
  127. delete strings;
  128. delete[] elements;
  129. uprv_free(bytes);
  130. }
  131. BytesTrieBuilder &
  132. BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
  133. if(U_FAILURE(errorCode)) {
  134. return *this;
  135. }
  136. if(bytesLength>0) {
  137. // Cannot add elements after building.
  138. errorCode=U_NO_WRITE_PERMISSION;
  139. return *this;
  140. }
  141. if(elementsLength==elementsCapacity) {
  142. int32_t newCapacity;
  143. if(elementsCapacity==0) {
  144. newCapacity=1024;
  145. } else {
  146. newCapacity=4*elementsCapacity;
  147. }
  148. BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
  149. if(newElements==nullptr) {
  150. errorCode=U_MEMORY_ALLOCATION_ERROR;
  151. return *this; // error instead of dereferencing null
  152. }
  153. if(elementsLength>0) {
  154. uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
  155. }
  156. delete[] elements;
  157. elements=newElements;
  158. elementsCapacity=newCapacity;
  159. }
  160. elements[elementsLength++].setTo(s, value, *strings, errorCode);
  161. return *this;
  162. }
  163. U_CDECL_BEGIN
  164. static int32_t U_CALLCONV
  165. compareElementStrings(const void *context, const void *left, const void *right) {
  166. const CharString *strings=static_cast<const CharString *>(context);
  167. const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
  168. const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
  169. return leftElement->compareStringTo(*rightElement, *strings);
  170. }
  171. U_CDECL_END
  172. BytesTrie *
  173. BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
  174. buildBytes(buildOption, errorCode);
  175. BytesTrie *newTrie=nullptr;
  176. if(U_SUCCESS(errorCode)) {
  177. newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
  178. if(newTrie==nullptr) {
  179. errorCode=U_MEMORY_ALLOCATION_ERROR;
  180. } else {
  181. bytes=nullptr; // The new trie now owns the array.
  182. bytesCapacity=0;
  183. }
  184. }
  185. return newTrie;
  186. }
  187. StringPiece
  188. BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
  189. buildBytes(buildOption, errorCode);
  190. StringPiece result;
  191. if(U_SUCCESS(errorCode)) {
  192. result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
  193. }
  194. return result;
  195. }
  196. void
  197. BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
  198. if(U_FAILURE(errorCode)) {
  199. return;
  200. }
  201. if(bytes!=nullptr && bytesLength>0) {
  202. // Already built.
  203. return;
  204. }
  205. if(bytesLength==0) {
  206. if(elementsLength==0) {
  207. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  208. return;
  209. }
  210. uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(BytesTrieElement)),
  211. compareElementStrings, strings,
  212. false, // need not be a stable sort
  213. &errorCode);
  214. if(U_FAILURE(errorCode)) {
  215. return;
  216. }
  217. // Duplicate strings are not allowed.
  218. StringPiece prev=elements[0].getString(*strings);
  219. for(int32_t i=1; i<elementsLength; ++i) {
  220. StringPiece current=elements[i].getString(*strings);
  221. if(prev==current) {
  222. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  223. return;
  224. }
  225. prev=current;
  226. }
  227. }
  228. // Create and byte-serialize the trie for the elements.
  229. bytesLength=0;
  230. int32_t capacity=strings->length();
  231. if(capacity<1024) {
  232. capacity=1024;
  233. }
  234. if(bytesCapacity<capacity) {
  235. uprv_free(bytes);
  236. bytes=static_cast<char *>(uprv_malloc(capacity));
  237. if(bytes==nullptr) {
  238. errorCode=U_MEMORY_ALLOCATION_ERROR;
  239. bytesCapacity=0;
  240. return;
  241. }
  242. bytesCapacity=capacity;
  243. }
  244. StringTrieBuilder::build(buildOption, elementsLength, errorCode);
  245. if(bytes==nullptr) {
  246. errorCode=U_MEMORY_ALLOCATION_ERROR;
  247. }
  248. }
  249. BytesTrieBuilder &
  250. BytesTrieBuilder::clear() {
  251. strings->clear();
  252. elementsLength=0;
  253. bytesLength=0;
  254. return *this;
  255. }
  256. int32_t
  257. BytesTrieBuilder::getElementStringLength(int32_t i) const {
  258. return elements[i].getStringLength(*strings);
  259. }
  260. char16_t
  261. BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
  262. return static_cast<uint8_t>(elements[i].charAt(byteIndex, *strings));
  263. }
  264. int32_t
  265. BytesTrieBuilder::getElementValue(int32_t i) const {
  266. return elements[i].getValue();
  267. }
  268. int32_t
  269. BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
  270. const BytesTrieElement &firstElement=elements[first];
  271. const BytesTrieElement &lastElement=elements[last];
  272. int32_t minStringLength=firstElement.getStringLength(*strings);
  273. while(++byteIndex<minStringLength &&
  274. firstElement.charAt(byteIndex, *strings)==
  275. lastElement.charAt(byteIndex, *strings)) {}
  276. return byteIndex;
  277. }
  278. int32_t
  279. BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
  280. int32_t length=0; // Number of different bytes at byteIndex.
  281. int32_t i=start;
  282. do {
  283. char byte=elements[i++].charAt(byteIndex, *strings);
  284. while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
  285. ++i;
  286. }
  287. ++length;
  288. } while(i<limit);
  289. return length;
  290. }
  291. int32_t
  292. BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
  293. do {
  294. char byte=elements[i++].charAt(byteIndex, *strings);
  295. while(byte==elements[i].charAt(byteIndex, *strings)) {
  296. ++i;
  297. }
  298. } while(--count>0);
  299. return i;
  300. }
  301. int32_t
  302. BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {
  303. char b = static_cast<char>(byte);
  304. while(b==elements[i].charAt(byteIndex, *strings)) {
  305. ++i;
  306. }
  307. return i;
  308. }
  309. BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
  310. : LinearMatchNode(len, nextNode), s(bytes) {
  311. hash=static_cast<int32_t>(
  312. static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));
  313. }
  314. bool
  315. BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
  316. if(this==&other) {
  317. return true;
  318. }
  319. if(!LinearMatchNode::operator==(other)) {
  320. return false;
  321. }
  322. const BTLinearMatchNode &o=static_cast<const BTLinearMatchNode &>(other);
  323. return 0==uprv_memcmp(s, o.s, length);
  324. }
  325. void
  326. BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
  327. BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);
  328. next->write(builder);
  329. b.write(s, length);
  330. offset=b.write(b.getMinLinearMatch()+length-1);
  331. }
  332. StringTrieBuilder::Node *
  333. BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
  334. Node *nextNode) const {
  335. return new BTLinearMatchNode(
  336. elements[i].getString(*strings).data()+byteIndex,
  337. length,
  338. nextNode);
  339. }
  340. UBool
  341. BytesTrieBuilder::ensureCapacity(int32_t length) {
  342. if(bytes==nullptr) {
  343. return false; // previous memory allocation had failed
  344. }
  345. if(length>bytesCapacity) {
  346. int32_t newCapacity=bytesCapacity;
  347. do {
  348. newCapacity*=2;
  349. } while(newCapacity<=length);
  350. char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
  351. if(newBytes==nullptr) {
  352. // unable to allocate memory
  353. uprv_free(bytes);
  354. bytes=nullptr;
  355. bytesCapacity=0;
  356. return false;
  357. }
  358. uprv_memcpy(newBytes+(newCapacity-bytesLength),
  359. bytes+(bytesCapacity-bytesLength), bytesLength);
  360. uprv_free(bytes);
  361. bytes=newBytes;
  362. bytesCapacity=newCapacity;
  363. }
  364. return true;
  365. }
  366. int32_t
  367. BytesTrieBuilder::write(int32_t byte) {
  368. int32_t newLength=bytesLength+1;
  369. if(ensureCapacity(newLength)) {
  370. bytesLength=newLength;
  371. bytes[bytesCapacity - bytesLength] = static_cast<char>(byte);
  372. }
  373. return bytesLength;
  374. }
  375. int32_t
  376. BytesTrieBuilder::write(const char *b, int32_t length) {
  377. int32_t newLength=bytesLength+length;
  378. if(ensureCapacity(newLength)) {
  379. bytesLength=newLength;
  380. uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
  381. }
  382. return bytesLength;
  383. }
  384. int32_t
  385. BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
  386. return write(elements[i].getString(*strings).data()+byteIndex, length);
  387. }
  388. int32_t
  389. BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
  390. if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
  391. return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
  392. }
  393. char intBytes[5];
  394. int32_t length=1;
  395. if(i<0 || i>0xffffff) {
  396. intBytes[0] = static_cast<char>(BytesTrie::kFiveByteValueLead);
  397. intBytes[1] = static_cast<char>(static_cast<uint32_t>(i) >> 24);
  398. intBytes[2] = static_cast<char>(static_cast<uint32_t>(i) >> 16);
  399. intBytes[3] = static_cast<char>(static_cast<uint32_t>(i) >> 8);
  400. intBytes[4] = static_cast<char>(i);
  401. length=5;
  402. // } else if(i<=BytesTrie::kMaxOneByteValue) {
  403. // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
  404. } else {
  405. if(i<=BytesTrie::kMaxTwoByteValue) {
  406. intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteValueLead + (i >> 8));
  407. } else {
  408. if(i<=BytesTrie::kMaxThreeByteValue) {
  409. intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteValueLead + (i >> 16));
  410. } else {
  411. intBytes[0] = static_cast<char>(BytesTrie::kFourByteValueLead);
  412. intBytes[1] = static_cast<char>(i >> 16);
  413. length=2;
  414. }
  415. intBytes[length++] = static_cast<char>(i >> 8);
  416. }
  417. intBytes[length++] = static_cast<char>(i);
  418. }
  419. intBytes[0] = static_cast<char>((intBytes[0] << 1) | isFinal);
  420. return write(intBytes, length);
  421. }
  422. int32_t
  423. BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
  424. int32_t offset=write(node);
  425. if(hasValue) {
  426. offset=writeValueAndFinal(value, false);
  427. }
  428. return offset;
  429. }
  430. int32_t
  431. BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
  432. int32_t i=bytesLength-jumpTarget;
  433. U_ASSERT(i>=0);
  434. if(i<=BytesTrie::kMaxOneByteDelta) {
  435. return write(i);
  436. } else {
  437. char intBytes[5];
  438. return write(intBytes, internalEncodeDelta(i, intBytes));
  439. }
  440. }
  441. int32_t
  442. BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
  443. U_ASSERT(i>=0);
  444. if(i<=BytesTrie::kMaxOneByteDelta) {
  445. intBytes[0] = static_cast<char>(i);
  446. return 1;
  447. }
  448. int32_t length=1;
  449. if(i<=BytesTrie::kMaxTwoByteDelta) {
  450. intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteDeltaLead + (i >> 8));
  451. } else {
  452. if(i<=BytesTrie::kMaxThreeByteDelta) {
  453. intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteDeltaLead + (i >> 16));
  454. } else {
  455. if(i<=0xffffff) {
  456. intBytes[0] = static_cast<char>(BytesTrie::kFourByteDeltaLead);
  457. } else {
  458. intBytes[0] = static_cast<char>(BytesTrie::kFiveByteDeltaLead);
  459. intBytes[1] = static_cast<char>(i >> 24);
  460. length=2;
  461. }
  462. intBytes[length++] = static_cast<char>(i >> 16);
  463. }
  464. intBytes[length++] = static_cast<char>(i >> 8);
  465. }
  466. intBytes[length++] = static_cast<char>(i);
  467. return length;
  468. }
  469. U_NAMESPACE_END