propsvec.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2011, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: propsvec.c
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002feb22
  16. * created by: Markus W. Scherer
  17. *
  18. * Store bits (Unicode character properties) in bit set vectors.
  19. */
  20. #include <stdlib.h>
  21. #include "unicode/utypes.h"
  22. #include "cmemory.h"
  23. #include "utrie.h"
  24. #include "utrie2.h"
  25. #include "uarrsort.h"
  26. #include "propsvec.h"
  27. #include "uassert.h"
  28. struct UPropsVectors {
  29. uint32_t *v;
  30. int32_t columns; /* number of columns, plus two for start & limit values */
  31. int32_t maxRows;
  32. int32_t rows;
  33. int32_t prevRow; /* search optimization: remember last row seen */
  34. UBool isCompacted;
  35. };
  36. #define UPVEC_INITIAL_ROWS (1<<12)
  37. #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
  38. #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
  39. U_CAPI UPropsVectors * U_EXPORT2
  40. upvec_open(int32_t columns, UErrorCode *pErrorCode) {
  41. UPropsVectors *pv;
  42. uint32_t *v, *row;
  43. uint32_t cp;
  44. if(U_FAILURE(*pErrorCode)) {
  45. return nullptr;
  46. }
  47. if(columns<1) {
  48. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  49. return nullptr;
  50. }
  51. columns+=2; /* count range start and limit columns */
  52. pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
  53. v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
  54. if(pv==nullptr || v==nullptr) {
  55. uprv_free(pv);
  56. uprv_free(v);
  57. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  58. return nullptr;
  59. }
  60. uprv_memset(pv, 0, sizeof(UPropsVectors));
  61. pv->v=v;
  62. pv->columns=columns;
  63. pv->maxRows=UPVEC_INITIAL_ROWS;
  64. pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
  65. /* set the all-Unicode row and the special-value rows */
  66. row=pv->v;
  67. uprv_memset(row, 0, pv->rows*columns*4);
  68. row[0]=0;
  69. row[1]=0x110000;
  70. row+=columns;
  71. for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
  72. row[0]=cp;
  73. row[1]=cp+1;
  74. row+=columns;
  75. }
  76. return pv;
  77. }
  78. U_CAPI void U_EXPORT2
  79. upvec_close(UPropsVectors *pv) {
  80. if(pv!=nullptr) {
  81. uprv_free(pv->v);
  82. uprv_free(pv);
  83. }
  84. }
  85. static uint32_t *
  86. _findRow(UPropsVectors *pv, UChar32 rangeStart) {
  87. uint32_t *row;
  88. int32_t columns, i, start, limit, prevRow;
  89. columns=pv->columns;
  90. limit=pv->rows;
  91. prevRow=pv->prevRow;
  92. /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
  93. row=pv->v+prevRow*columns;
  94. if (rangeStart >= static_cast<UChar32>(row[0])) {
  95. if (rangeStart < static_cast<UChar32>(row[1])) {
  96. /* same row as last seen */
  97. return row;
  98. } else if (rangeStart < static_cast<UChar32>((row += columns)[1])) {
  99. /* next row after the last one */
  100. pv->prevRow=prevRow+1;
  101. return row;
  102. } else if (rangeStart < static_cast<UChar32>((row += columns)[1])) {
  103. /* second row after the last one */
  104. pv->prevRow=prevRow+2;
  105. return row;
  106. } else if ((rangeStart - static_cast<UChar32>(row[1])) < 10) {
  107. /* we are close, continue looping */
  108. prevRow+=2;
  109. do {
  110. ++prevRow;
  111. row+=columns;
  112. } while (rangeStart >= static_cast<UChar32>(row[1]));
  113. pv->prevRow=prevRow;
  114. return row;
  115. }
  116. } else if (rangeStart < static_cast<UChar32>(pv->v[1])) {
  117. /* the very first row */
  118. pv->prevRow=0;
  119. return pv->v;
  120. }
  121. /* do a binary search for the start of the range */
  122. start=0;
  123. while(start<limit-1) {
  124. i=(start+limit)/2;
  125. row=pv->v+i*columns;
  126. if (rangeStart < static_cast<UChar32>(row[0])) {
  127. limit=i;
  128. } else if (rangeStart < static_cast<UChar32>(row[1])) {
  129. pv->prevRow=i;
  130. return row;
  131. } else {
  132. start=i;
  133. }
  134. }
  135. /* must be found because all ranges together always cover all of Unicode */
  136. pv->prevRow=start;
  137. return pv->v+start*columns;
  138. }
  139. U_CAPI void U_EXPORT2
  140. upvec_setValue(UPropsVectors *pv,
  141. UChar32 start, UChar32 end,
  142. int32_t column,
  143. uint32_t value, uint32_t mask,
  144. UErrorCode *pErrorCode) {
  145. uint32_t *firstRow, *lastRow;
  146. int32_t columns;
  147. UChar32 limit;
  148. UBool splitFirstRow, splitLastRow;
  149. /* argument checking */
  150. if(U_FAILURE(*pErrorCode)) {
  151. return;
  152. }
  153. if( pv==nullptr ||
  154. start<0 || start>end || end>UPVEC_MAX_CP ||
  155. column<0 || column>=(pv->columns-2)
  156. ) {
  157. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  158. return;
  159. }
  160. if(pv->isCompacted) {
  161. *pErrorCode=U_NO_WRITE_PERMISSION;
  162. return;
  163. }
  164. limit=end+1;
  165. /* initialize */
  166. columns=pv->columns;
  167. column+=2; /* skip range start and limit columns */
  168. value&=mask;
  169. /* find the rows whose ranges overlap with the input range */
  170. /* find the first and last rows, always successful */
  171. firstRow=_findRow(pv, start);
  172. lastRow=_findRow(pv, end);
  173. /*
  174. * Rows need to be split if they partially overlap with the
  175. * input range (only possible for the first and last rows)
  176. * and if their value differs from the input value.
  177. */
  178. splitFirstRow = start != static_cast<UChar32>(firstRow[0]) && value != (firstRow[column] & mask);
  179. splitLastRow = limit != static_cast<UChar32>(lastRow[1]) && value != (lastRow[column] & mask);
  180. /* split first/last rows if necessary */
  181. if(splitFirstRow || splitLastRow) {
  182. int32_t count, rows;
  183. rows=pv->rows;
  184. if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
  185. uint32_t *newVectors;
  186. int32_t newMaxRows;
  187. if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
  188. newMaxRows=UPVEC_MEDIUM_ROWS;
  189. } else if(pv->maxRows<UPVEC_MAX_ROWS) {
  190. newMaxRows=UPVEC_MAX_ROWS;
  191. } else {
  192. /* Implementation bug, or UPVEC_MAX_ROWS too low. */
  193. *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
  194. return;
  195. }
  196. newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
  197. if(newVectors==nullptr) {
  198. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  199. return;
  200. }
  201. uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
  202. firstRow=newVectors+(firstRow-pv->v);
  203. lastRow=newVectors+(lastRow-pv->v);
  204. uprv_free(pv->v);
  205. pv->v=newVectors;
  206. pv->maxRows=newMaxRows;
  207. }
  208. /* count the number of row cells to move after the last row, and move them */
  209. count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
  210. if(count>0) {
  211. uprv_memmove(
  212. lastRow+(1+splitFirstRow+splitLastRow)*columns,
  213. lastRow+columns,
  214. count*4);
  215. }
  216. pv->rows=rows+splitFirstRow+splitLastRow;
  217. /* split the first row, and move the firstRow pointer to the second part */
  218. if(splitFirstRow) {
  219. /* copy all affected rows up one and move the lastRow pointer */
  220. count = (int32_t)((lastRow-firstRow)+columns);
  221. uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
  222. lastRow+=columns;
  223. /* split the range and move the firstRow pointer */
  224. firstRow[1]=firstRow[columns]=(uint32_t)start;
  225. firstRow+=columns;
  226. }
  227. /* split the last row */
  228. if(splitLastRow) {
  229. /* copy the last row data */
  230. uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);
  231. /* split the range and move the firstRow pointer */
  232. lastRow[1]=lastRow[columns]=(uint32_t)limit;
  233. }
  234. }
  235. /* set the "row last seen" to the last row for the range */
  236. pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
  237. /* set the input value in all remaining rows */
  238. firstRow+=column;
  239. lastRow+=column;
  240. mask=~mask;
  241. for(;;) {
  242. *firstRow=(*firstRow&mask)|value;
  243. if(firstRow==lastRow) {
  244. break;
  245. }
  246. firstRow+=columns;
  247. }
  248. }
  249. U_CAPI uint32_t U_EXPORT2
  250. upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
  251. uint32_t *row;
  252. UPropsVectors *ncpv;
  253. if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
  254. return 0;
  255. }
  256. ncpv=(UPropsVectors *)pv;
  257. row=_findRow(ncpv, c);
  258. return row[2+column];
  259. }
  260. U_CAPI uint32_t * U_EXPORT2
  261. upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
  262. UChar32 *pRangeStart, UChar32 *pRangeEnd) {
  263. uint32_t *row;
  264. int32_t columns;
  265. if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
  266. return nullptr;
  267. }
  268. columns=pv->columns;
  269. row=pv->v+rowIndex*columns;
  270. if(pRangeStart!=nullptr) {
  271. *pRangeStart=(UChar32)row[0];
  272. }
  273. if(pRangeEnd!=nullptr) {
  274. *pRangeEnd=(UChar32)row[1]-1;
  275. }
  276. return row+2;
  277. }
  278. static int32_t U_CALLCONV
  279. upvec_compareRows(const void *context, const void *l, const void *r) {
  280. const uint32_t* left = static_cast<const uint32_t*>(l), *right = static_cast<const uint32_t*>(r);
  281. const UPropsVectors* pv = static_cast<const UPropsVectors*>(context);
  282. int32_t i, count, columns;
  283. count=columns=pv->columns; /* includes start/limit columns */
  284. /* start comparing after start/limit but wrap around to them */
  285. i=2;
  286. do {
  287. if(left[i]!=right[i]) {
  288. return left[i]<right[i] ? -1 : 1;
  289. }
  290. if(++i==columns) {
  291. i=0;
  292. }
  293. } while(--count>0);
  294. return 0;
  295. }
  296. U_CAPI void U_EXPORT2
  297. upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
  298. uint32_t *row;
  299. int32_t i, columns, valueColumns, rows, count;
  300. UChar32 start, limit;
  301. /* argument checking */
  302. if(U_FAILURE(*pErrorCode)) {
  303. return;
  304. }
  305. if(handler==nullptr) {
  306. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  307. return;
  308. }
  309. if(pv->isCompacted) {
  310. return;
  311. }
  312. /* Set the flag now: Sorting and compacting destroys the builder data structure. */
  313. pv->isCompacted=true;
  314. rows=pv->rows;
  315. columns=pv->columns;
  316. U_ASSERT(columns>=3); /* upvec_open asserts this */
  317. valueColumns=columns-2; /* not counting start & limit */
  318. /* sort the properties vectors to find unique vector values */
  319. uprv_sortArray(pv->v, rows, columns*4,
  320. upvec_compareRows, pv, false, pErrorCode);
  321. if(U_FAILURE(*pErrorCode)) {
  322. return;
  323. }
  324. /*
  325. * Find and set the special values.
  326. * This has to do almost the same work as the compaction below,
  327. * to find the indexes where the special-value rows will move.
  328. */
  329. row=pv->v;
  330. count=-valueColumns;
  331. for(i=0; i<rows; ++i) {
  332. start=(UChar32)row[0];
  333. /* count a new values vector if it is different from the current one */
  334. if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
  335. count+=valueColumns;
  336. }
  337. if(start>=UPVEC_FIRST_SPECIAL_CP) {
  338. handler(context, start, start, count, row+2, valueColumns, pErrorCode);
  339. if(U_FAILURE(*pErrorCode)) {
  340. return;
  341. }
  342. }
  343. row+=columns;
  344. }
  345. /* count is at the beginning of the last vector, add valueColumns to include that last vector */
  346. count+=valueColumns;
  347. /* Call the handler once more to signal the start of delivering real values. */
  348. handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
  349. count, row-valueColumns, valueColumns, pErrorCode);
  350. if(U_FAILURE(*pErrorCode)) {
  351. return;
  352. }
  353. /*
  354. * Move vector contents up to a contiguous array with only unique
  355. * vector values, and call the handler function for each vector.
  356. *
  357. * This destroys the Properties Vector structure and replaces it
  358. * with an array of just vector values.
  359. */
  360. row=pv->v;
  361. count=-valueColumns;
  362. for(i=0; i<rows; ++i) {
  363. /* fetch these first before memmove() may overwrite them */
  364. start=(UChar32)row[0];
  365. limit=(UChar32)row[1];
  366. /* add a new values vector if it is different from the current one */
  367. if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
  368. count+=valueColumns;
  369. uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
  370. }
  371. if(start<UPVEC_FIRST_SPECIAL_CP) {
  372. handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
  373. if(U_FAILURE(*pErrorCode)) {
  374. return;
  375. }
  376. }
  377. row+=columns;
  378. }
  379. /* count is at the beginning of the last vector, add one to include that last vector */
  380. pv->rows=count/valueColumns+1;
  381. }
  382. U_CAPI const uint32_t * U_EXPORT2
  383. upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
  384. if(!pv->isCompacted) {
  385. return nullptr;
  386. }
  387. if(pRows!=nullptr) {
  388. *pRows=pv->rows;
  389. }
  390. if(pColumns!=nullptr) {
  391. *pColumns=pv->columns-2;
  392. }
  393. return pv->v;
  394. }
  395. U_CAPI uint32_t * U_EXPORT2
  396. upvec_cloneArray(const UPropsVectors *pv,
  397. int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
  398. uint32_t *clonedArray;
  399. int32_t byteLength;
  400. if(U_FAILURE(*pErrorCode)) {
  401. return nullptr;
  402. }
  403. if(!pv->isCompacted) {
  404. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  405. return nullptr;
  406. }
  407. byteLength=pv->rows*(pv->columns-2)*4;
  408. clonedArray=(uint32_t *)uprv_malloc(byteLength);
  409. if(clonedArray==nullptr) {
  410. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  411. return nullptr;
  412. }
  413. uprv_memcpy(clonedArray, pv->v, byteLength);
  414. if(pRows!=nullptr) {
  415. *pRows=pv->rows;
  416. }
  417. if(pColumns!=nullptr) {
  418. *pColumns=pv->columns-2;
  419. }
  420. return clonedArray;
  421. }
  422. U_CAPI UTrie2 * U_EXPORT2
  423. upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
  424. UPVecToUTrie2Context toUTrie2={ nullptr, 0, 0, 0 };
  425. upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
  426. utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
  427. if(U_FAILURE(*pErrorCode)) {
  428. utrie2_close(toUTrie2.trie);
  429. toUTrie2.trie=nullptr;
  430. }
  431. return toUTrie2.trie;
  432. }
  433. /*
  434. * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
  435. * some 16-bit field and builds and returns a UTrie2.
  436. */
  437. U_CAPI void U_CALLCONV
  438. upvec_compactToUTrie2Handler(void *context,
  439. UChar32 start, UChar32 end,
  440. int32_t rowIndex, uint32_t *row, int32_t columns,
  441. UErrorCode *pErrorCode) {
  442. (void)row;
  443. (void)columns;
  444. UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
  445. if(start<UPVEC_FIRST_SPECIAL_CP) {
  446. utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, true, pErrorCode);
  447. } else {
  448. switch(start) {
  449. case UPVEC_INITIAL_VALUE_CP:
  450. toUTrie2->initialValue=rowIndex;
  451. break;
  452. case UPVEC_ERROR_VALUE_CP:
  453. toUTrie2->errorValue=rowIndex;
  454. break;
  455. case UPVEC_START_REAL_VALUES_CP:
  456. toUTrie2->maxValue=rowIndex;
  457. if(rowIndex>0xffff) {
  458. /* too many rows for a 16-bit trie */
  459. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  460. } else {
  461. toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
  462. toUTrie2->errorValue, pErrorCode);
  463. }
  464. break;
  465. default:
  466. break;
  467. }
  468. }
  469. }