rbbidata.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 1999-2014 International Business Machines Corporation *
  6. * and others. All rights reserved. *
  7. ***************************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #if !UCONFIG_NO_BREAK_ITERATION
  11. #include "unicode/ucptrie.h"
  12. #include "unicode/utypes.h"
  13. #include "rbbidata.h"
  14. #include "rbbirb.h"
  15. #include "udatamem.h"
  16. #include "cmemory.h"
  17. #include "cstring.h"
  18. #include "umutex.h"
  19. #include "uassert.h"
  20. U_NAMESPACE_BEGIN
  21. //-----------------------------------------------------------------------------
  22. //
  23. // Constructors.
  24. //
  25. //-----------------------------------------------------------------------------
  26. RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
  27. init0();
  28. init(data, status);
  29. }
  30. RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
  31. init0();
  32. init(data, status);
  33. fDontFreeData = true;
  34. }
  35. RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
  36. init0();
  37. if (U_FAILURE(status)) {
  38. return;
  39. }
  40. const DataHeader *dh = udm->pHeader;
  41. int32_t headerSize = dh->dataHeader.headerSize;
  42. if ( !(headerSize >= 20 &&
  43. dh->info.isBigEndian == U_IS_BIG_ENDIAN &&
  44. dh->info.charsetFamily == U_CHARSET_FAMILY &&
  45. dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk "
  46. dh->info.dataFormat[1] == 0x72 &&
  47. dh->info.dataFormat[2] == 0x6b &&
  48. dh->info.dataFormat[3] == 0x20 &&
  49. isDataVersionAcceptable(dh->info.formatVersion))
  50. ) {
  51. status = U_INVALID_FORMAT_ERROR;
  52. return;
  53. }
  54. const char *dataAsBytes = reinterpret_cast<const char *>(dh);
  55. const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize);
  56. init(rbbidh, status);
  57. fUDataMem = udm;
  58. }
  59. UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
  60. return RBBI_DATA_FORMAT_VERSION[0] == version[0];
  61. }
  62. //-----------------------------------------------------------------------------
  63. //
  64. // init(). Does most of the work of construction, shared between the
  65. // constructors.
  66. //
  67. //-----------------------------------------------------------------------------
  68. void RBBIDataWrapper::init0() {
  69. fHeader = nullptr;
  70. fForwardTable = nullptr;
  71. fReverseTable = nullptr;
  72. fRuleSource = nullptr;
  73. fRuleStatusTable = nullptr;
  74. fTrie = nullptr;
  75. fUDataMem = nullptr;
  76. fRefCount = 0;
  77. fDontFreeData = true;
  78. }
  79. void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
  80. if (U_FAILURE(status)) {
  81. return;
  82. }
  83. fHeader = data;
  84. if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) {
  85. status = U_INVALID_FORMAT_ERROR;
  86. return;
  87. }
  88. // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
  89. // that is no longer supported. At that time fFormatVersion was
  90. // an int32_t field, rather than an array of 4 bytes.
  91. fDontFreeData = false;
  92. if (data->fFTableLen != 0) {
  93. fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
  94. }
  95. if (data->fRTableLen != 0) {
  96. fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
  97. }
  98. fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST,
  99. UCPTRIE_VALUE_BITS_ANY,
  100. (uint8_t *)data + fHeader->fTrie,
  101. fHeader->fTrieLen,
  102. nullptr, // *actual length
  103. &status);
  104. if (U_FAILURE(status)) {
  105. return;
  106. }
  107. UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie);
  108. if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) {
  109. status = U_INVALID_FORMAT_ERROR;
  110. return;
  111. }
  112. fRuleSource = ((char *)data + fHeader->fRuleSource);
  113. fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen));
  114. U_ASSERT(data->fRuleSourceLen > 0);
  115. fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
  116. fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
  117. fRefCount = 1;
  118. #ifdef RBBI_DEBUG
  119. char *debugEnv = getenv("U_RBBIDEBUG");
  120. if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
  121. #endif
  122. }
  123. //-----------------------------------------------------------------------------
  124. //
  125. // Destructor. Don't call this - use removeReference() instead.
  126. //
  127. //-----------------------------------------------------------------------------
  128. RBBIDataWrapper::~RBBIDataWrapper() {
  129. U_ASSERT(fRefCount == 0);
  130. ucptrie_close(fTrie);
  131. fTrie = nullptr;
  132. if (fUDataMem) {
  133. udata_close(fUDataMem);
  134. } else if (!fDontFreeData) {
  135. uprv_free((void *)fHeader);
  136. }
  137. }
  138. //-----------------------------------------------------------------------------
  139. //
  140. // Operator == Consider two RBBIDataWrappers to be equal if they
  141. // refer to the same underlying data. Although
  142. // the data wrappers are normally shared between
  143. // iterator instances, it's possible to independently
  144. // open the same data twice, and get two instances, which
  145. // should still be ==.
  146. //
  147. //-----------------------------------------------------------------------------
  148. bool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
  149. if (fHeader == other.fHeader) {
  150. return true;
  151. }
  152. if (fHeader->fLength != other.fHeader->fLength) {
  153. return false;
  154. }
  155. if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
  156. return true;
  157. }
  158. return false;
  159. }
  160. int32_t RBBIDataWrapper::hashCode() {
  161. return fHeader->fFTableLen;
  162. }
  163. //-----------------------------------------------------------------------------
  164. //
  165. // Reference Counting. A single RBBIDataWrapper object is shared among
  166. // however many RulesBasedBreakIterator instances are
  167. // referencing the same data.
  168. //
  169. //-----------------------------------------------------------------------------
  170. void RBBIDataWrapper::removeReference() {
  171. if (umtx_atomic_dec(&fRefCount) == 0) {
  172. delete this;
  173. }
  174. }
  175. RBBIDataWrapper *RBBIDataWrapper::addReference() {
  176. umtx_atomic_inc(&fRefCount);
  177. return this;
  178. }
  179. //-----------------------------------------------------------------------------
  180. //
  181. // getRuleSourceString
  182. //
  183. //-----------------------------------------------------------------------------
  184. const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
  185. return fRuleString;
  186. }
  187. //-----------------------------------------------------------------------------
  188. //
  189. // print - debugging function to dump the runtime data tables.
  190. //
  191. //-----------------------------------------------------------------------------
  192. #ifdef RBBI_DEBUG
  193. void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
  194. uint32_t c;
  195. uint32_t s;
  196. RBBIDebugPrintf("%s\n", heading);
  197. RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart);
  198. RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize);
  199. RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n",
  200. table->fFlags,
  201. table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F",
  202. table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F",
  203. table->fFlags & RBBI_8BITS_ROWS ? "T" : "F");
  204. RBBIDebugPrintf("\nState | Acc LA TagIx");
  205. for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
  206. RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
  207. RBBIDebugPrintf("----");
  208. }
  209. RBBIDebugPrintf("\n");
  210. if (table == nullptr) {
  211. RBBIDebugPrintf(" N U L L T A B L E\n\n");
  212. return;
  213. }
  214. UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS;
  215. for (s=0; s<table->fNumStates; s++) {
  216. RBBIStateTableRow *row = (RBBIStateTableRow *)
  217. (table->fTableData + (table->fRowLen * s));
  218. if (use8Bits) {
  219. RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagsIdx);
  220. for (c=0; c<fHeader->fCatCount; c++) {
  221. RBBIDebugPrintf("%3d ", row->r8.fNextState[c]);
  222. }
  223. } else {
  224. RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagsIdx);
  225. for (c=0; c<fHeader->fCatCount; c++) {
  226. RBBIDebugPrintf("%3d ", row->r16.fNextState[c]);
  227. }
  228. }
  229. RBBIDebugPrintf("\n");
  230. }
  231. RBBIDebugPrintf("\n");
  232. }
  233. #endif
  234. void RBBIDataWrapper::printData() {
  235. #ifdef RBBI_DEBUG
  236. RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
  237. RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
  238. fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
  239. RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
  240. RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
  241. printTable("Forward State Transition Table", fForwardTable);
  242. printTable("Reverse State Transition Table", fReverseTable);
  243. RBBIDebugPrintf("\nOriginal Rules source:\n");
  244. for (int32_t c=0; fRuleSource[c] != 0; c++) {
  245. RBBIDebugPrintf("%c", fRuleSource[c]);
  246. }
  247. RBBIDebugPrintf("\n\n");
  248. #endif
  249. }
  250. U_NAMESPACE_END
  251. U_NAMESPACE_USE
  252. //-----------------------------------------------------------------------------
  253. //
  254. // ubrk_swap - byte swap and char encoding swap of RBBI data
  255. //
  256. //-----------------------------------------------------------------------------
  257. U_CAPI int32_t U_EXPORT2
  258. ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
  259. UErrorCode *status) {
  260. if (status == nullptr || U_FAILURE(*status)) {
  261. return 0;
  262. }
  263. if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
  264. *status=U_ILLEGAL_ARGUMENT_ERROR;
  265. return 0;
  266. }
  267. //
  268. // Check that the data header is for for break data.
  269. // (Header contents are defined in genbrk.cpp)
  270. //
  271. const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
  272. if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */
  273. pInfo->dataFormat[1]==0x72 &&
  274. pInfo->dataFormat[2]==0x6b &&
  275. pInfo->dataFormat[3]==0x20 &&
  276. RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) {
  277. udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
  278. pInfo->dataFormat[0], pInfo->dataFormat[1],
  279. pInfo->dataFormat[2], pInfo->dataFormat[3],
  280. pInfo->formatVersion[0]);
  281. *status=U_UNSUPPORTED_ERROR;
  282. return 0;
  283. }
  284. //
  285. // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific
  286. // RBBIDataHeader). This swap also conveniently gets us
  287. // the size of the ICU d.h., which lets us locate the start
  288. // of the RBBI specific data.
  289. //
  290. int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
  291. //
  292. // Get the RRBI Data Header, and check that it appears to be OK.
  293. //
  294. const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
  295. RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
  296. if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
  297. !RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) ||
  298. ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) {
  299. udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
  300. *status=U_UNSUPPORTED_ERROR;
  301. return 0;
  302. }
  303. //
  304. // Prefight operation? Just return the size
  305. //
  306. int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
  307. int32_t totalSize = headerSize + breakDataLength;
  308. if (length < 0) {
  309. return totalSize;
  310. }
  311. //
  312. // Check that length passed in is consistent with length from RBBI data header.
  313. //
  314. if (length < totalSize) {
  315. udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
  316. breakDataLength);
  317. *status=U_INDEX_OUTOFBOUNDS_ERROR;
  318. return 0;
  319. }
  320. //
  321. // Swap the Data. Do the data itself first, then the RBBI Data Header, because
  322. // we need to reference the header to locate the data, and an
  323. // inplace swap of the header leaves it unusable.
  324. //
  325. uint8_t *outBytes = (uint8_t *)outData + headerSize;
  326. RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes;
  327. int32_t tableStartOffset;
  328. int32_t tableLength;
  329. //
  330. // If not swapping in place, zero out the output buffer before starting.
  331. // Individual tables and other data items within are aligned to 8 byte boundaries
  332. // when originally created. Any unused space between items needs to be zero.
  333. //
  334. if (inBytes != outBytes) {
  335. uprv_memset(outBytes, 0, breakDataLength);
  336. }
  337. //
  338. // Each state table begins with several 32 bit fields. Calculate the size
  339. // in bytes of these.
  340. //
  341. int32_t topSize = offsetof(RBBIStateTable, fTableData);
  342. // Forward state table.
  343. tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
  344. tableLength = ds->readUInt32(rbbiDH->fFTableLen);
  345. if (tableLength > 0) {
  346. RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
  347. UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
  348. ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
  349. outBytes+tableStartOffset, status);
  350. // Swap the state table if the table is in 16 bits.
  351. if (use8Bits) {
  352. if (outBytes != inBytes) {
  353. uprv_memmove(outBytes+tableStartOffset+topSize,
  354. inBytes+tableStartOffset+topSize,
  355. tableLength-topSize);
  356. }
  357. } else {
  358. ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
  359. outBytes+tableStartOffset+topSize, status);
  360. }
  361. }
  362. // Reverse state table. Same layout as forward table, above.
  363. tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
  364. tableLength = ds->readUInt32(rbbiDH->fRTableLen);
  365. if (tableLength > 0) {
  366. RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset);
  367. UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS;
  368. ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
  369. outBytes+tableStartOffset, status);
  370. // Swap the state table if the table is in 16 bits.
  371. if (use8Bits) {
  372. if (outBytes != inBytes) {
  373. uprv_memmove(outBytes+tableStartOffset+topSize,
  374. inBytes+tableStartOffset+topSize,
  375. tableLength-topSize);
  376. }
  377. } else {
  378. ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
  379. outBytes+tableStartOffset+topSize, status);
  380. }
  381. }
  382. // Trie table for character categories
  383. ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
  384. outBytes+ds->readUInt32(rbbiDH->fTrie), status);
  385. // Source Rules Text. It's UTF8 data
  386. if (outBytes != inBytes) {
  387. uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource),
  388. inBytes+ds->readUInt32(rbbiDH->fRuleSource),
  389. ds->readUInt32(rbbiDH->fRuleSourceLen));
  390. }
  391. // Table of rule status values. It's all int_32 values
  392. ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
  393. outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
  394. // And, last, the header.
  395. // It is all int32_t values except for fFormataVersion, which is an array of four bytes.
  396. // Swap the whole thing as int32_t, then re-swap the one field.
  397. //
  398. ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
  399. ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
  400. return totalSize;
  401. }
  402. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */