unisetspan.cpp 61 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2007-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: unisetspan.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2007mar01
  16. * created by: Markus W. Scherer
  17. */
  18. #include "unicode/utypes.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/ustring.h"
  21. #include "unicode/utf8.h"
  22. #include "unicode/utf16.h"
  23. #include "cmemory.h"
  24. #include "uvector.h"
  25. #include "unisetspan.h"
  26. U_NAMESPACE_BEGIN
  27. /*
  28. * List of offsets from the current position from where to try matching
  29. * a code point or a string.
  30. * Store offsets rather than indexes to simplify the code and use the same list
  31. * for both increments (in span()) and decrements (in spanBack()).
  32. *
  33. * Assumption: The maximum offset is limited, and the offsets that are stored
  34. * at any one time are relatively dense, that is, there are normally no gaps of
  35. * hundreds or thousands of offset values.
  36. *
  37. * The implementation uses a circular buffer of byte flags,
  38. * each indicating whether the corresponding offset is in the list.
  39. * This avoids inserting into a sorted list of offsets (or absolute indexes) and
  40. * physically moving part of the list.
  41. *
  42. * Note: In principle, the caller should setMaxLength() to the maximum of the
  43. * max string length and U16_LENGTH/U8_LENGTH to account for
  44. * "long" single code points.
  45. * However, this implementation uses at least a staticList with more than
  46. * U8_LENGTH entries anyway.
  47. *
  48. * Note: If maxLength were guaranteed to be no more than 32 or 64,
  49. * the list could be stored as bit flags in a single integer.
  50. * Rather than handling a circular buffer with a start list index,
  51. * the integer would simply be shifted when lower offsets are removed.
  52. * UnicodeSet does not have a limit on the lengths of strings.
  53. */
  54. class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
  55. public:
  56. OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
  57. ~OffsetList() {
  58. if(list!=staticList) {
  59. uprv_free(list);
  60. }
  61. }
  62. // Call exactly once if the list is to be used.
  63. void setMaxLength(int32_t maxLength) {
  64. if (maxLength <= static_cast<int32_t>(sizeof(staticList))) {
  65. capacity = static_cast<int32_t>(sizeof(staticList));
  66. } else {
  67. UBool* l = static_cast<UBool*>(uprv_malloc(maxLength));
  68. if(l!=nullptr) {
  69. list=l;
  70. capacity=maxLength;
  71. }
  72. }
  73. uprv_memset(list, 0, capacity);
  74. }
  75. void clear() {
  76. uprv_memset(list, 0, capacity);
  77. start=length=0;
  78. }
  79. UBool isEmpty() const {
  80. return length == 0;
  81. }
  82. // Reduce all stored offsets by delta, used when the current position
  83. // moves by delta.
  84. // There must not be any offsets lower than delta.
  85. // If there is an offset equal to delta, it is removed.
  86. // delta=[1..maxLength]
  87. void shift(int32_t delta) {
  88. int32_t i=start+delta;
  89. if(i>=capacity) {
  90. i-=capacity;
  91. }
  92. if(list[i]) {
  93. list[i]=false;
  94. --length;
  95. }
  96. start=i;
  97. }
  98. // Add an offset. The list must not contain it yet.
  99. // offset=[1..maxLength]
  100. void addOffset(int32_t offset) {
  101. int32_t i=start+offset;
  102. if(i>=capacity) {
  103. i-=capacity;
  104. }
  105. list[i]=true;
  106. ++length;
  107. }
  108. // offset=[1..maxLength]
  109. UBool containsOffset(int32_t offset) const {
  110. int32_t i=start+offset;
  111. if(i>=capacity) {
  112. i-=capacity;
  113. }
  114. return list[i];
  115. }
  116. // Find the lowest stored offset from a non-empty list, remove it,
  117. // and reduce all other offsets by this minimum.
  118. // Returns [1..maxLength].
  119. int32_t popMinimum() {
  120. // Look for the next offset in list[start+1..capacity-1].
  121. int32_t i=start, result;
  122. while(++i<capacity) {
  123. if(list[i]) {
  124. list[i]=false;
  125. --length;
  126. result=i-start;
  127. start=i;
  128. return result;
  129. }
  130. }
  131. // i==capacity
  132. // Wrap around and look for the next offset in list[0..start].
  133. // Since the list is not empty, there will be one.
  134. result=capacity-start;
  135. i=0;
  136. while(!list[i]) {
  137. ++i;
  138. }
  139. list[i]=false;
  140. --length;
  141. start=i;
  142. return result+=i;
  143. }
  144. private:
  145. UBool *list;
  146. int32_t capacity;
  147. int32_t length;
  148. int32_t start;
  149. UBool staticList[16];
  150. };
  151. // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
  152. static int32_t
  153. getUTF8Length(const char16_t *s, int32_t length) {
  154. UErrorCode errorCode=U_ZERO_ERROR;
  155. int32_t length8=0;
  156. u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode);
  157. if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
  158. return length8;
  159. } else {
  160. // The string contains an unpaired surrogate.
  161. // Ignore this string.
  162. return 0;
  163. }
  164. }
  165. // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
  166. static int32_t
  167. appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) {
  168. UErrorCode errorCode=U_ZERO_ERROR;
  169. int32_t length8=0;
  170. u_strToUTF8(reinterpret_cast<char*>(t), capacity, &length8, s, length, &errorCode);
  171. if(U_SUCCESS(errorCode)) {
  172. return length8;
  173. } else {
  174. // The string contains an unpaired surrogate.
  175. // Ignore this string.
  176. return 0;
  177. }
  178. }
  179. static inline uint8_t
  180. makeSpanLengthByte(int32_t spanLength) {
  181. // 0xfe==UnicodeSetStringSpan::LONG_SPAN
  182. return spanLength < 0xfe ? static_cast<uint8_t>(spanLength) : static_cast<uint8_t>(0xfe);
  183. }
  184. // Construct for all variants of span(), or only for any one variant.
  185. // Initialize as little as possible, for single use.
  186. UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
  187. const UVector &setStrings,
  188. uint32_t which)
  189. : spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings),
  190. utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
  191. utf8Length(0),
  192. maxLength16(0), maxLength8(0),
  193. all(static_cast<UBool>(which == ALL)) {
  194. spanSet.retainAll(set);
  195. if(which&NOT_CONTAINED) {
  196. // Default to the same sets.
  197. // addToSpanNotSet() will create a separate set if necessary.
  198. pSpanNotSet=&spanSet;
  199. }
  200. // Determine if the strings even need to be taken into account at all for span() etc.
  201. // If any string is relevant, then all strings need to be used for
  202. // span(longest match) but only the relevant ones for span(while contained).
  203. // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
  204. // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
  205. // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
  206. // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
  207. int32_t stringsLength=strings.size();
  208. int32_t i, spanLength;
  209. UBool someRelevant=false;
  210. for(i=0; i<stringsLength; ++i) {
  211. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  212. const char16_t *s16=string.getBuffer();
  213. int32_t length16=string.length();
  214. if (length16==0) {
  215. continue; // skip the empty string
  216. }
  217. UBool thisRelevant;
  218. spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
  219. if(spanLength<length16) { // Relevant string.
  220. someRelevant=thisRelevant=true;
  221. } else {
  222. thisRelevant=false;
  223. }
  224. if((which&UTF16) && length16>maxLength16) {
  225. maxLength16=length16;
  226. }
  227. if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
  228. int32_t length8=getUTF8Length(s16, length16);
  229. utf8Length+=length8;
  230. if(length8>maxLength8) {
  231. maxLength8=length8;
  232. }
  233. }
  234. }
  235. if(!someRelevant) {
  236. maxLength16=maxLength8=0;
  237. return;
  238. }
  239. // Freeze after checking for the need to use strings at all because freezing
  240. // a set takes some time and memory which are wasted if there are no relevant strings.
  241. if(all) {
  242. spanSet.freeze();
  243. }
  244. uint8_t *spanBackLengths;
  245. uint8_t *spanUTF8Lengths;
  246. uint8_t *spanBackUTF8Lengths;
  247. // Allocate a block of meta data.
  248. int32_t allocSize;
  249. if(all) {
  250. // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
  251. allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
  252. } else {
  253. allocSize=stringsLength; // One set of span lengths.
  254. if(which&UTF8) {
  255. // UTF-8 lengths and UTF-8 strings.
  256. allocSize+=stringsLength*4+utf8Length;
  257. }
  258. }
  259. if (allocSize <= static_cast<int32_t>(sizeof(staticLengths))) {
  260. utf8Lengths=staticLengths;
  261. } else {
  262. utf8Lengths = static_cast<int32_t*>(uprv_malloc(allocSize));
  263. if(utf8Lengths==nullptr) {
  264. maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false.
  265. return; // Out of memory.
  266. }
  267. }
  268. if(all) {
  269. // Store span lengths for all span() variants.
  270. spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength);
  271. spanBackLengths=spanLengths+stringsLength;
  272. spanUTF8Lengths=spanBackLengths+stringsLength;
  273. spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
  274. utf8=spanBackUTF8Lengths+stringsLength;
  275. } else {
  276. // Store span lengths for only one span() variant.
  277. if(which&UTF8) {
  278. spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength);
  279. utf8=spanLengths+stringsLength;
  280. } else {
  281. spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths);
  282. }
  283. spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
  284. }
  285. // Set the meta data and pSpanNotSet and write the UTF-8 strings.
  286. int32_t utf8Count=0; // Count UTF-8 bytes written so far.
  287. for(i=0; i<stringsLength; ++i) {
  288. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  289. const char16_t *s16=string.getBuffer();
  290. int32_t length16=string.length();
  291. spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
  292. if(spanLength<length16 && length16>0) { // Relevant string.
  293. if(which&UTF16) {
  294. if(which&CONTAINED) {
  295. if(which&FWD) {
  296. spanLengths[i]=makeSpanLengthByte(spanLength);
  297. }
  298. if(which&BACK) {
  299. spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
  300. spanBackLengths[i]=makeSpanLengthByte(spanLength);
  301. }
  302. } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
  303. spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
  304. }
  305. }
  306. if(which&UTF8) {
  307. uint8_t *s8=utf8+utf8Count;
  308. int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
  309. utf8Count+=utf8Lengths[i]=length8;
  310. if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
  311. spanUTF8Lengths[i] = spanBackUTF8Lengths[i] = static_cast<uint8_t>(ALL_CP_CONTAINED);
  312. } else { // Relevant for UTF-8.
  313. if(which&CONTAINED) {
  314. if(which&FWD) {
  315. spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s8), length8, USET_SPAN_CONTAINED);
  316. spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
  317. }
  318. if(which&BACK) {
  319. spanLength = length8 - spanSet.spanBackUTF8(reinterpret_cast<const char*>(s8), length8, USET_SPAN_CONTAINED);
  320. spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
  321. }
  322. } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
  323. spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
  324. }
  325. }
  326. }
  327. if(which&NOT_CONTAINED) {
  328. // Add string start and end code points to the spanNotSet so that
  329. // a span(while not contained) stops before any string.
  330. UChar32 c;
  331. if(which&FWD) {
  332. int32_t len=0;
  333. U16_NEXT(s16, len, length16, c);
  334. addToSpanNotSet(c);
  335. }
  336. if(which&BACK) {
  337. int32_t len=length16;
  338. U16_PREV(s16, 0, len, c);
  339. addToSpanNotSet(c);
  340. }
  341. }
  342. } else { // Irrelevant string. (Also the empty string.)
  343. if(which&UTF8) {
  344. if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
  345. uint8_t *s8=utf8+utf8Count;
  346. int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
  347. utf8Count+=utf8Lengths[i]=length8;
  348. } else {
  349. utf8Lengths[i]=0;
  350. }
  351. }
  352. if(all) {
  353. spanLengths[i]=spanBackLengths[i]=
  354. spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
  355. static_cast<uint8_t>(ALL_CP_CONTAINED);
  356. } else {
  357. // All spanXYZLengths pointers contain the same address.
  358. spanLengths[i] = static_cast<uint8_t>(ALL_CP_CONTAINED);
  359. }
  360. }
  361. }
  362. // Finish.
  363. if(all) {
  364. pSpanNotSet->freeze();
  365. }
  366. }
  367. // Copy constructor. Assumes which==ALL for a frozen set.
  368. UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
  369. const UVector &newParentSetStrings)
  370. : spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings),
  371. utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
  372. utf8Length(otherStringSpan.utf8Length),
  373. maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
  374. all(true) {
  375. if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
  376. pSpanNotSet=&spanSet;
  377. } else {
  378. pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
  379. }
  380. // Allocate a block of meta data.
  381. // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
  382. int32_t stringsLength=strings.size();
  383. int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
  384. if (allocSize <= static_cast<int32_t>(sizeof(staticLengths))) {
  385. utf8Lengths=staticLengths;
  386. } else {
  387. utf8Lengths = static_cast<int32_t*>(uprv_malloc(allocSize));
  388. if(utf8Lengths==nullptr) {
  389. maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false.
  390. return; // Out of memory.
  391. }
  392. }
  393. spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength);
  394. utf8=spanLengths+stringsLength*4;
  395. uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
  396. }
  397. UnicodeSetStringSpan::~UnicodeSetStringSpan() {
  398. if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) {
  399. delete pSpanNotSet;
  400. }
  401. if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) {
  402. uprv_free(utf8Lengths);
  403. }
  404. }
  405. void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
  406. if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) {
  407. if(spanSet.contains(c)) {
  408. return; // Nothing to do.
  409. }
  410. UnicodeSet *newSet=spanSet.cloneAsThawed();
  411. if(newSet==nullptr) {
  412. return; // Out of memory.
  413. } else {
  414. pSpanNotSet=newSet;
  415. }
  416. }
  417. pSpanNotSet->add(c);
  418. }
  419. // Compare strings without any argument checks. Requires length>0.
  420. static inline UBool
  421. matches16(const char16_t *s, const char16_t *t, int32_t length) {
  422. do {
  423. if(*s++!=*t++) {
  424. return false;
  425. }
  426. } while(--length>0);
  427. return true;
  428. }
  429. static inline UBool
  430. matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
  431. do {
  432. if(*s++!=*t++) {
  433. return false;
  434. }
  435. } while(--length>0);
  436. return true;
  437. }
  438. // Compare 16-bit Unicode strings (which may be malformed UTF-16)
  439. // at code point boundaries.
  440. // That is, each edge of a match must not be in the middle of a surrogate pair.
  441. static inline UBool
  442. matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) {
  443. s+=start;
  444. limit-=start;
  445. return matches16(s, t, length) &&
  446. !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
  447. !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
  448. }
  449. // Does the set contain the next code point?
  450. // If so, return its length; otherwise return its negative length.
  451. static inline int32_t
  452. spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
  453. char16_t c=*s, c2;
  454. if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
  455. return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
  456. }
  457. return set.contains(c) ? 1 : -1;
  458. }
  459. static inline int32_t
  460. spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) {
  461. char16_t c=s[length-1], c2;
  462. if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
  463. return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
  464. }
  465. return set.contains(c) ? 1 : -1;
  466. }
  467. static inline int32_t
  468. spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
  469. UChar32 c=*s;
  470. if(U8_IS_SINGLE(c)) {
  471. return set.contains(c) ? 1 : -1;
  472. }
  473. // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
  474. int32_t i=0;
  475. U8_NEXT_OR_FFFD(s, i, length, c);
  476. return set.contains(c) ? i : -i;
  477. }
  478. static inline int32_t
  479. spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
  480. UChar32 c=s[length-1];
  481. if(U8_IS_SINGLE(c)) {
  482. return set.contains(c) ? 1 : -1;
  483. }
  484. int32_t i=length-1;
  485. c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
  486. length-=i;
  487. return set.contains(c) ? length : -length;
  488. }
  489. /*
  490. * Note: In span() when spanLength==0 (after a string match, or at the beginning
  491. * after an empty code point span) and in spanNot() and spanNotUTF8(),
  492. * string matching could use a binary search
  493. * because all string matches are done from the same start index.
  494. *
  495. * For UTF-8, this would require a comparison function that returns UTF-16 order.
  496. *
  497. * This optimization should not be necessary for normal UnicodeSets because
  498. * most sets have no strings, and most sets with strings have
  499. * very few very short strings.
  500. * For cases with many strings, it might be better to use a different API
  501. * and implementation with a DFA (state machine).
  502. */
  503. /*
  504. * Algorithm for span(USET_SPAN_CONTAINED)
  505. *
  506. * Theoretical algorithm:
  507. * - Iterate through the string, and at each code point boundary:
  508. * + If the code point there is in the set, then remember to continue after it.
  509. * + If a set string matches at the current position, then remember to continue after it.
  510. * + Either recursively span for each code point or string match,
  511. * or recursively span for all but the shortest one and
  512. * iteratively continue the span with the shortest local match.
  513. * + Remember the longest recursive span (the farthest end point).
  514. * + If there is no match at the current position, neither for the code point there
  515. * nor for any set string, then stop and return the longest recursive span length.
  516. *
  517. * Optimized implementation:
  518. *
  519. * (We assume that most sets will have very few very short strings.
  520. * A span using a string-less set is extremely fast.)
  521. *
  522. * Create and cache a spanSet which contains all of the single code points
  523. * of the original set but none of its strings.
  524. *
  525. * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
  526. * - Loop:
  527. * + Try to match each set string at the end of the spanLength.
  528. * ~ Set strings that start with set-contained code points must be matched
  529. * with a partial overlap because the recursive algorithm would have tried
  530. * to match them at every position.
  531. * ~ Set strings that entirely consist of set-contained code points
  532. * are irrelevant for span(USET_SPAN_CONTAINED) because the
  533. * recursive algorithm would continue after them anyway
  534. * and find the longest recursive match from their end.
  535. * ~ Rather than recursing, note each end point of a set string match.
  536. * + If no set string matched after spanSet.span(), then return
  537. * with where the spanSet.span() ended.
  538. * + If at least one set string matched after spanSet.span(), then
  539. * pop the shortest string match end point and continue
  540. * the loop, trying to match all set strings from there.
  541. * + If at least one more set string matched after a previous string match,
  542. * then test if the code point after the previous string match is also
  543. * contained in the set.
  544. * Continue the loop with the shortest end point of either this code point
  545. * or a matching set string.
  546. * + If no more set string matched after a previous string match,
  547. * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
  548. * Stop if spanLength==0, otherwise continue the loop.
  549. *
  550. * By noting each end point of a set string match,
  551. * the function visits each string position at most once and finishes
  552. * in linear time.
  553. *
  554. * The recursive algorithm may visit the same string position many times
  555. * if multiple paths lead to it and finishes in exponential time.
  556. */
  557. /*
  558. * Algorithm for span(USET_SPAN_SIMPLE)
  559. *
  560. * Theoretical algorithm:
  561. * - Iterate through the string, and at each code point boundary:
  562. * + If the code point there is in the set, then remember to continue after it.
  563. * + If a set string matches at the current position, then remember to continue after it.
  564. * + Continue from the farthest match position and ignore all others.
  565. * + If there is no match at the current position,
  566. * then stop and return the current position.
  567. *
  568. * Optimized implementation:
  569. *
  570. * (Same assumption and spanSet as above.)
  571. *
  572. * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
  573. * - Loop:
  574. * + Try to match each set string at the end of the spanLength.
  575. * ~ Set strings that start with set-contained code points must be matched
  576. * with a partial overlap because the standard algorithm would have tried
  577. * to match them earlier.
  578. * ~ Set strings that entirely consist of set-contained code points
  579. * must be matched with a full overlap because the longest-match algorithm
  580. * would hide set string matches that end earlier.
  581. * Such set strings need not be matched earlier inside the code point span
  582. * because the standard algorithm would then have continued after
  583. * the set string match anyway.
  584. * ~ Remember the longest set string match (farthest end point) from the earliest
  585. * starting point.
  586. * + If no set string matched after spanSet.span(), then return
  587. * with where the spanSet.span() ended.
  588. * + If at least one set string matched, then continue the loop after the
  589. * longest match from the earliest position.
  590. * + If no more set string matched after a previous string match,
  591. * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
  592. * Stop if spanLength==0, otherwise continue the loop.
  593. */
  594. int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
  595. if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  596. return spanNot(s, length);
  597. }
  598. int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
  599. if(spanLength==length) {
  600. return length;
  601. }
  602. // Consider strings; they may overlap with the span.
  603. OffsetList offsets;
  604. if(spanCondition==USET_SPAN_CONTAINED) {
  605. // Use offset list to try all possibilities.
  606. offsets.setMaxLength(maxLength16);
  607. }
  608. int32_t pos=spanLength, rest=length-pos;
  609. int32_t i, stringsLength=strings.size();
  610. for(;;) {
  611. if(spanCondition==USET_SPAN_CONTAINED) {
  612. for(i=0; i<stringsLength; ++i) {
  613. int32_t overlap=spanLengths[i];
  614. if(overlap==ALL_CP_CONTAINED) {
  615. continue; // Irrelevant string. (Also the empty string.)
  616. }
  617. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  618. const char16_t *s16=string.getBuffer();
  619. int32_t length16=string.length();
  620. U_ASSERT(length>0);
  621. // Try to match this string at pos-overlap..pos.
  622. if(overlap>=LONG_SPAN) {
  623. overlap=length16;
  624. // While contained: No point matching fully inside the code point span.
  625. U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
  626. }
  627. if(overlap>spanLength) {
  628. overlap=spanLength;
  629. }
  630. int32_t inc=length16-overlap; // Keep overlap+inc==length16.
  631. for(;;) {
  632. if(inc>rest) {
  633. break;
  634. }
  635. // Try to match if the increment is not listed already.
  636. if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
  637. if(inc==rest) {
  638. return length; // Reached the end of the string.
  639. }
  640. offsets.addOffset(inc);
  641. }
  642. if(overlap==0) {
  643. break;
  644. }
  645. --overlap;
  646. ++inc;
  647. }
  648. }
  649. } else /* USET_SPAN_SIMPLE */ {
  650. int32_t maxInc=0, maxOverlap=0;
  651. for(i=0; i<stringsLength; ++i) {
  652. int32_t overlap=spanLengths[i];
  653. // For longest match, we do need to try to match even an all-contained string
  654. // to find the match from the earliest start.
  655. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  656. const char16_t *s16=string.getBuffer();
  657. int32_t length16=string.length();
  658. if (length16==0) {
  659. continue; // skip the empty string
  660. }
  661. // Try to match this string at pos-overlap..pos.
  662. if(overlap>=LONG_SPAN) {
  663. overlap=length16;
  664. // Longest match: Need to match fully inside the code point span
  665. // to find the match from the earliest start.
  666. }
  667. if(overlap>spanLength) {
  668. overlap=spanLength;
  669. }
  670. int32_t inc=length16-overlap; // Keep overlap+inc==length16.
  671. for(;;) {
  672. if(inc>rest || overlap<maxOverlap) {
  673. break;
  674. }
  675. // Try to match if the string is longer or starts earlier.
  676. if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
  677. matches16CPB(s, pos-overlap, length, s16, length16)
  678. ) {
  679. maxInc=inc; // Longest match from earliest start.
  680. maxOverlap=overlap;
  681. break;
  682. }
  683. --overlap;
  684. ++inc;
  685. }
  686. }
  687. if(maxInc!=0 || maxOverlap!=0) {
  688. // Longest-match algorithm, and there was a string match.
  689. // Simply continue after it.
  690. pos+=maxInc;
  691. rest-=maxInc;
  692. if(rest==0) {
  693. return length; // Reached the end of the string.
  694. }
  695. spanLength=0; // Match strings from after a string match.
  696. continue;
  697. }
  698. }
  699. // Finished trying to match all strings at pos.
  700. if(spanLength!=0 || pos==0) {
  701. // The position is after an unlimited code point span (spanLength!=0),
  702. // not after a string match.
  703. // The only position where spanLength==0 after a span is pos==0.
  704. // Otherwise, an unlimited code point span is only tried again when no
  705. // strings match, and if such a non-initial span fails we stop.
  706. if(offsets.isEmpty()) {
  707. return pos; // No strings matched after a span.
  708. }
  709. // Match strings from after the next string match.
  710. } else {
  711. // The position is after a string match (or a single code point).
  712. if(offsets.isEmpty()) {
  713. // No more strings matched after a previous string match.
  714. // Try another code point span from after the last string match.
  715. spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
  716. if( spanLength==rest || // Reached the end of the string, or
  717. spanLength==0 // neither strings nor span progressed.
  718. ) {
  719. return pos+spanLength;
  720. }
  721. pos+=spanLength;
  722. rest-=spanLength;
  723. continue; // spanLength>0: Match strings from after a span.
  724. } else {
  725. // Try to match only one code point from after a string match if some
  726. // string matched beyond it, so that we try all possible positions
  727. // and don't overshoot.
  728. spanLength=spanOne(spanSet, s+pos, rest);
  729. if(spanLength>0) {
  730. if(spanLength==rest) {
  731. return length; // Reached the end of the string.
  732. }
  733. // Match strings after this code point.
  734. // There cannot be any increments below it because UnicodeSet strings
  735. // contain multiple code points.
  736. pos+=spanLength;
  737. rest-=spanLength;
  738. offsets.shift(spanLength);
  739. spanLength=0;
  740. continue; // Match strings from after a single code point.
  741. }
  742. // Match strings from after the next string match.
  743. }
  744. }
  745. int32_t minOffset=offsets.popMinimum();
  746. pos+=minOffset;
  747. rest-=minOffset;
  748. spanLength=0; // Match strings from after a string match.
  749. }
  750. }
  751. int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
  752. if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  753. return spanNotBack(s, length);
  754. }
  755. int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
  756. if(pos==0) {
  757. return 0;
  758. }
  759. int32_t spanLength=length-pos;
  760. // Consider strings; they may overlap with the span.
  761. OffsetList offsets;
  762. if(spanCondition==USET_SPAN_CONTAINED) {
  763. // Use offset list to try all possibilities.
  764. offsets.setMaxLength(maxLength16);
  765. }
  766. int32_t i, stringsLength=strings.size();
  767. uint8_t *spanBackLengths=spanLengths;
  768. if(all) {
  769. spanBackLengths+=stringsLength;
  770. }
  771. for(;;) {
  772. if(spanCondition==USET_SPAN_CONTAINED) {
  773. for(i=0; i<stringsLength; ++i) {
  774. int32_t overlap=spanBackLengths[i];
  775. if(overlap==ALL_CP_CONTAINED) {
  776. continue; // Irrelevant string. (Also the empty string.)
  777. }
  778. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  779. const char16_t *s16=string.getBuffer();
  780. int32_t length16=string.length();
  781. U_ASSERT(length>0);
  782. // Try to match this string at pos-(length16-overlap)..pos-length16.
  783. if(overlap>=LONG_SPAN) {
  784. overlap=length16;
  785. // While contained: No point matching fully inside the code point span.
  786. int32_t len1=0;
  787. U16_FWD_1(s16, len1, overlap);
  788. overlap-=len1; // Length of the string minus the first code point.
  789. }
  790. if(overlap>spanLength) {
  791. overlap=spanLength;
  792. }
  793. int32_t dec=length16-overlap; // Keep dec+overlap==length16.
  794. for(;;) {
  795. if(dec>pos) {
  796. break;
  797. }
  798. // Try to match if the decrement is not listed already.
  799. if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
  800. if(dec==pos) {
  801. return 0; // Reached the start of the string.
  802. }
  803. offsets.addOffset(dec);
  804. }
  805. if(overlap==0) {
  806. break;
  807. }
  808. --overlap;
  809. ++dec;
  810. }
  811. }
  812. } else /* USET_SPAN_SIMPLE */ {
  813. int32_t maxDec=0, maxOverlap=0;
  814. for(i=0; i<stringsLength; ++i) {
  815. int32_t overlap=spanBackLengths[i];
  816. // For longest match, we do need to try to match even an all-contained string
  817. // to find the match from the latest end.
  818. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  819. const char16_t *s16=string.getBuffer();
  820. int32_t length16=string.length();
  821. if (length16==0) {
  822. continue; // skip the empty string
  823. }
  824. // Try to match this string at pos-(length16-overlap)..pos-length16.
  825. if(overlap>=LONG_SPAN) {
  826. overlap=length16;
  827. // Longest match: Need to match fully inside the code point span
  828. // to find the match from the latest end.
  829. }
  830. if(overlap>spanLength) {
  831. overlap=spanLength;
  832. }
  833. int32_t dec=length16-overlap; // Keep dec+overlap==length16.
  834. for(;;) {
  835. if(dec>pos || overlap<maxOverlap) {
  836. break;
  837. }
  838. // Try to match if the string is longer or ends later.
  839. if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
  840. matches16CPB(s, pos-dec, length, s16, length16)
  841. ) {
  842. maxDec=dec; // Longest match from latest end.
  843. maxOverlap=overlap;
  844. break;
  845. }
  846. --overlap;
  847. ++dec;
  848. }
  849. }
  850. if(maxDec!=0 || maxOverlap!=0) {
  851. // Longest-match algorithm, and there was a string match.
  852. // Simply continue before it.
  853. pos-=maxDec;
  854. if(pos==0) {
  855. return 0; // Reached the start of the string.
  856. }
  857. spanLength=0; // Match strings from before a string match.
  858. continue;
  859. }
  860. }
  861. // Finished trying to match all strings at pos.
  862. if(spanLength!=0 || pos==length) {
  863. // The position is before an unlimited code point span (spanLength!=0),
  864. // not before a string match.
  865. // The only position where spanLength==0 before a span is pos==length.
  866. // Otherwise, an unlimited code point span is only tried again when no
  867. // strings match, and if such a non-initial span fails we stop.
  868. if(offsets.isEmpty()) {
  869. return pos; // No strings matched before a span.
  870. }
  871. // Match strings from before the next string match.
  872. } else {
  873. // The position is before a string match (or a single code point).
  874. if(offsets.isEmpty()) {
  875. // No more strings matched before a previous string match.
  876. // Try another code point span from before the last string match.
  877. int32_t oldPos=pos;
  878. pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
  879. spanLength=oldPos-pos;
  880. if( pos==0 || // Reached the start of the string, or
  881. spanLength==0 // neither strings nor span progressed.
  882. ) {
  883. return pos;
  884. }
  885. continue; // spanLength>0: Match strings from before a span.
  886. } else {
  887. // Try to match only one code point from before a string match if some
  888. // string matched beyond it, so that we try all possible positions
  889. // and don't overshoot.
  890. spanLength=spanOneBack(spanSet, s, pos);
  891. if(spanLength>0) {
  892. if(spanLength==pos) {
  893. return 0; // Reached the start of the string.
  894. }
  895. // Match strings before this code point.
  896. // There cannot be any decrements below it because UnicodeSet strings
  897. // contain multiple code points.
  898. pos-=spanLength;
  899. offsets.shift(spanLength);
  900. spanLength=0;
  901. continue; // Match strings from before a single code point.
  902. }
  903. // Match strings from before the next string match.
  904. }
  905. }
  906. pos-=offsets.popMinimum();
  907. spanLength=0; // Match strings from before a string match.
  908. }
  909. }
  910. int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
  911. if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  912. return spanNotUTF8(s, length);
  913. }
  914. int32_t spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s), length, USET_SPAN_CONTAINED);
  915. if(spanLength==length) {
  916. return length;
  917. }
  918. // Consider strings; they may overlap with the span.
  919. OffsetList offsets;
  920. if(spanCondition==USET_SPAN_CONTAINED) {
  921. // Use offset list to try all possibilities.
  922. offsets.setMaxLength(maxLength8);
  923. }
  924. int32_t pos=spanLength, rest=length-pos;
  925. int32_t i, stringsLength=strings.size();
  926. uint8_t *spanUTF8Lengths=spanLengths;
  927. if(all) {
  928. spanUTF8Lengths+=2*stringsLength;
  929. }
  930. for(;;) {
  931. const uint8_t *s8=utf8;
  932. int32_t length8;
  933. if(spanCondition==USET_SPAN_CONTAINED) {
  934. for(i=0; i<stringsLength; ++i) {
  935. length8=utf8Lengths[i];
  936. if(length8==0) {
  937. continue; // String not representable in UTF-8.
  938. }
  939. int32_t overlap=spanUTF8Lengths[i];
  940. if(overlap==ALL_CP_CONTAINED) {
  941. s8+=length8;
  942. continue; // Irrelevant string.
  943. }
  944. // Try to match this string at pos-overlap..pos.
  945. if(overlap>=LONG_SPAN) {
  946. overlap=length8;
  947. // While contained: No point matching fully inside the code point span.
  948. U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
  949. }
  950. if(overlap>spanLength) {
  951. overlap=spanLength;
  952. }
  953. int32_t inc=length8-overlap; // Keep overlap+inc==length8.
  954. for(;;) {
  955. if(inc>rest) {
  956. break;
  957. }
  958. // Try to match if the increment is not listed already.
  959. // Match at code point boundaries. (The UTF-8 strings were converted
  960. // from UTF-16 and are guaranteed to be well-formed.)
  961. if(!U8_IS_TRAIL(s[pos-overlap]) &&
  962. !offsets.containsOffset(inc) &&
  963. matches8(s+pos-overlap, s8, length8)) {
  964. if(inc==rest) {
  965. return length; // Reached the end of the string.
  966. }
  967. offsets.addOffset(inc);
  968. }
  969. if(overlap==0) {
  970. break;
  971. }
  972. --overlap;
  973. ++inc;
  974. }
  975. s8+=length8;
  976. }
  977. } else /* USET_SPAN_SIMPLE */ {
  978. int32_t maxInc=0, maxOverlap=0;
  979. for(i=0; i<stringsLength; ++i) {
  980. length8=utf8Lengths[i];
  981. if(length8==0) {
  982. continue; // String not representable in UTF-8.
  983. }
  984. int32_t overlap=spanUTF8Lengths[i];
  985. // For longest match, we do need to try to match even an all-contained string
  986. // to find the match from the earliest start.
  987. // Try to match this string at pos-overlap..pos.
  988. if(overlap>=LONG_SPAN) {
  989. overlap=length8;
  990. // Longest match: Need to match fully inside the code point span
  991. // to find the match from the earliest start.
  992. }
  993. if(overlap>spanLength) {
  994. overlap=spanLength;
  995. }
  996. int32_t inc=length8-overlap; // Keep overlap+inc==length8.
  997. for(;;) {
  998. if(inc>rest || overlap<maxOverlap) {
  999. break;
  1000. }
  1001. // Try to match if the string is longer or starts earlier.
  1002. // Match at code point boundaries. (The UTF-8 strings were converted
  1003. // from UTF-16 and are guaranteed to be well-formed.)
  1004. if(!U8_IS_TRAIL(s[pos-overlap]) &&
  1005. (overlap>maxOverlap ||
  1006. /* redundant overlap==maxOverlap && */ inc>maxInc) &&
  1007. matches8(s+pos-overlap, s8, length8)) {
  1008. maxInc=inc; // Longest match from earliest start.
  1009. maxOverlap=overlap;
  1010. break;
  1011. }
  1012. --overlap;
  1013. ++inc;
  1014. }
  1015. s8+=length8;
  1016. }
  1017. if(maxInc!=0 || maxOverlap!=0) {
  1018. // Longest-match algorithm, and there was a string match.
  1019. // Simply continue after it.
  1020. pos+=maxInc;
  1021. rest-=maxInc;
  1022. if(rest==0) {
  1023. return length; // Reached the end of the string.
  1024. }
  1025. spanLength=0; // Match strings from after a string match.
  1026. continue;
  1027. }
  1028. }
  1029. // Finished trying to match all strings at pos.
  1030. if(spanLength!=0 || pos==0) {
  1031. // The position is after an unlimited code point span (spanLength!=0),
  1032. // not after a string match.
  1033. // The only position where spanLength==0 after a span is pos==0.
  1034. // Otherwise, an unlimited code point span is only tried again when no
  1035. // strings match, and if such a non-initial span fails we stop.
  1036. if(offsets.isEmpty()) {
  1037. return pos; // No strings matched after a span.
  1038. }
  1039. // Match strings from after the next string match.
  1040. } else {
  1041. // The position is after a string match (or a single code point).
  1042. if(offsets.isEmpty()) {
  1043. // No more strings matched after a previous string match.
  1044. // Try another code point span from after the last string match.
  1045. spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s) + pos, rest, USET_SPAN_CONTAINED);
  1046. if( spanLength==rest || // Reached the end of the string, or
  1047. spanLength==0 // neither strings nor span progressed.
  1048. ) {
  1049. return pos+spanLength;
  1050. }
  1051. pos+=spanLength;
  1052. rest-=spanLength;
  1053. continue; // spanLength>0: Match strings from after a span.
  1054. } else {
  1055. // Try to match only one code point from after a string match if some
  1056. // string matched beyond it, so that we try all possible positions
  1057. // and don't overshoot.
  1058. spanLength=spanOneUTF8(spanSet, s+pos, rest);
  1059. if(spanLength>0) {
  1060. if(spanLength==rest) {
  1061. return length; // Reached the end of the string.
  1062. }
  1063. // Match strings after this code point.
  1064. // There cannot be any increments below it because UnicodeSet strings
  1065. // contain multiple code points.
  1066. pos+=spanLength;
  1067. rest-=spanLength;
  1068. offsets.shift(spanLength);
  1069. spanLength=0;
  1070. continue; // Match strings from after a single code point.
  1071. }
  1072. // Match strings from after the next string match.
  1073. }
  1074. }
  1075. int32_t minOffset=offsets.popMinimum();
  1076. pos+=minOffset;
  1077. rest-=minOffset;
  1078. spanLength=0; // Match strings from after a string match.
  1079. }
  1080. }
  1081. int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
  1082. if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  1083. return spanNotBackUTF8(s, length);
  1084. }
  1085. int32_t pos = spanSet.spanBackUTF8(reinterpret_cast<const char*>(s), length, USET_SPAN_CONTAINED);
  1086. if(pos==0) {
  1087. return 0;
  1088. }
  1089. int32_t spanLength=length-pos;
  1090. // Consider strings; they may overlap with the span.
  1091. OffsetList offsets;
  1092. if(spanCondition==USET_SPAN_CONTAINED) {
  1093. // Use offset list to try all possibilities.
  1094. offsets.setMaxLength(maxLength8);
  1095. }
  1096. int32_t i, stringsLength=strings.size();
  1097. uint8_t *spanBackUTF8Lengths=spanLengths;
  1098. if(all) {
  1099. spanBackUTF8Lengths+=3*stringsLength;
  1100. }
  1101. for(;;) {
  1102. const uint8_t *s8=utf8;
  1103. int32_t length8;
  1104. if(spanCondition==USET_SPAN_CONTAINED) {
  1105. for(i=0; i<stringsLength; ++i) {
  1106. length8=utf8Lengths[i];
  1107. if(length8==0) {
  1108. continue; // String not representable in UTF-8.
  1109. }
  1110. int32_t overlap=spanBackUTF8Lengths[i];
  1111. if(overlap==ALL_CP_CONTAINED) {
  1112. s8+=length8;
  1113. continue; // Irrelevant string.
  1114. }
  1115. // Try to match this string at pos-(length8-overlap)..pos-length8.
  1116. if(overlap>=LONG_SPAN) {
  1117. overlap=length8;
  1118. // While contained: No point matching fully inside the code point span.
  1119. int32_t len1=0;
  1120. U8_FWD_1(s8, len1, overlap);
  1121. overlap-=len1; // Length of the string minus the first code point.
  1122. }
  1123. if(overlap>spanLength) {
  1124. overlap=spanLength;
  1125. }
  1126. int32_t dec=length8-overlap; // Keep dec+overlap==length8.
  1127. for(;;) {
  1128. if(dec>pos) {
  1129. break;
  1130. }
  1131. // Try to match if the decrement is not listed already.
  1132. // Match at code point boundaries. (The UTF-8 strings were converted
  1133. // from UTF-16 and are guaranteed to be well-formed.)
  1134. if( !U8_IS_TRAIL(s[pos-dec]) &&
  1135. !offsets.containsOffset(dec) &&
  1136. matches8(s+pos-dec, s8, length8)
  1137. ) {
  1138. if(dec==pos) {
  1139. return 0; // Reached the start of the string.
  1140. }
  1141. offsets.addOffset(dec);
  1142. }
  1143. if(overlap==0) {
  1144. break;
  1145. }
  1146. --overlap;
  1147. ++dec;
  1148. }
  1149. s8+=length8;
  1150. }
  1151. } else /* USET_SPAN_SIMPLE */ {
  1152. int32_t maxDec=0, maxOverlap=0;
  1153. for(i=0; i<stringsLength; ++i) {
  1154. length8=utf8Lengths[i];
  1155. if(length8==0) {
  1156. continue; // String not representable in UTF-8.
  1157. }
  1158. int32_t overlap=spanBackUTF8Lengths[i];
  1159. // For longest match, we do need to try to match even an all-contained string
  1160. // to find the match from the latest end.
  1161. // Try to match this string at pos-(length8-overlap)..pos-length8.
  1162. if(overlap>=LONG_SPAN) {
  1163. overlap=length8;
  1164. // Longest match: Need to match fully inside the code point span
  1165. // to find the match from the latest end.
  1166. }
  1167. if(overlap>spanLength) {
  1168. overlap=spanLength;
  1169. }
  1170. int32_t dec=length8-overlap; // Keep dec+overlap==length8.
  1171. for(;;) {
  1172. if(dec>pos || overlap<maxOverlap) {
  1173. break;
  1174. }
  1175. // Try to match if the string is longer or ends later.
  1176. // Match at code point boundaries. (The UTF-8 strings were converted
  1177. // from UTF-16 and are guaranteed to be well-formed.)
  1178. if( !U8_IS_TRAIL(s[pos-dec]) &&
  1179. (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
  1180. matches8(s+pos-dec, s8, length8)
  1181. ) {
  1182. maxDec=dec; // Longest match from latest end.
  1183. maxOverlap=overlap;
  1184. break;
  1185. }
  1186. --overlap;
  1187. ++dec;
  1188. }
  1189. s8+=length8;
  1190. }
  1191. if(maxDec!=0 || maxOverlap!=0) {
  1192. // Longest-match algorithm, and there was a string match.
  1193. // Simply continue before it.
  1194. pos-=maxDec;
  1195. if(pos==0) {
  1196. return 0; // Reached the start of the string.
  1197. }
  1198. spanLength=0; // Match strings from before a string match.
  1199. continue;
  1200. }
  1201. }
  1202. // Finished trying to match all strings at pos.
  1203. if(spanLength!=0 || pos==length) {
  1204. // The position is before an unlimited code point span (spanLength!=0),
  1205. // not before a string match.
  1206. // The only position where spanLength==0 before a span is pos==length.
  1207. // Otherwise, an unlimited code point span is only tried again when no
  1208. // strings match, and if such a non-initial span fails we stop.
  1209. if(offsets.isEmpty()) {
  1210. return pos; // No strings matched before a span.
  1211. }
  1212. // Match strings from before the next string match.
  1213. } else {
  1214. // The position is before a string match (or a single code point).
  1215. if(offsets.isEmpty()) {
  1216. // No more strings matched before a previous string match.
  1217. // Try another code point span from before the last string match.
  1218. int32_t oldPos=pos;
  1219. pos = spanSet.spanBackUTF8(reinterpret_cast<const char*>(s), oldPos, USET_SPAN_CONTAINED);
  1220. spanLength=oldPos-pos;
  1221. if( pos==0 || // Reached the start of the string, or
  1222. spanLength==0 // neither strings nor span progressed.
  1223. ) {
  1224. return pos;
  1225. }
  1226. continue; // spanLength>0: Match strings from before a span.
  1227. } else {
  1228. // Try to match only one code point from before a string match if some
  1229. // string matched beyond it, so that we try all possible positions
  1230. // and don't overshoot.
  1231. spanLength=spanOneBackUTF8(spanSet, s, pos);
  1232. if(spanLength>0) {
  1233. if(spanLength==pos) {
  1234. return 0; // Reached the start of the string.
  1235. }
  1236. // Match strings before this code point.
  1237. // There cannot be any decrements below it because UnicodeSet strings
  1238. // contain multiple code points.
  1239. pos-=spanLength;
  1240. offsets.shift(spanLength);
  1241. spanLength=0;
  1242. continue; // Match strings from before a single code point.
  1243. }
  1244. // Match strings from before the next string match.
  1245. }
  1246. }
  1247. pos-=offsets.popMinimum();
  1248. spanLength=0; // Match strings from before a string match.
  1249. }
  1250. }
  1251. /*
  1252. * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
  1253. *
  1254. * Theoretical algorithm:
  1255. * - Iterate through the string, and at each code point boundary:
  1256. * + If the code point there is in the set, then return with the current position.
  1257. * + If a set string matches at the current position, then return with the current position.
  1258. *
  1259. * Optimized implementation:
  1260. *
  1261. * (Same assumption as for span() above.)
  1262. *
  1263. * Create and cache a spanNotSet which contains all of the single code points
  1264. * of the original set but none of its strings.
  1265. * For each set string add its initial code point to the spanNotSet.
  1266. * (Also add its final code point for spanNotBack().)
  1267. *
  1268. * - Loop:
  1269. * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
  1270. * + If the current code point is in the original set, then
  1271. * return the current position.
  1272. * + If any set string matches at the current position, then
  1273. * return the current position.
  1274. * + If there is no match at the current position, neither for the code point there
  1275. * nor for any set string, then skip this code point and continue the loop.
  1276. * This happens for set-string-initial code points that were added to spanNotSet
  1277. * when there is not actually a match for such a set string.
  1278. */
  1279. int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const {
  1280. int32_t pos=0, rest=length;
  1281. int32_t i, stringsLength=strings.size();
  1282. do {
  1283. // Span until we find a code point from the set,
  1284. // or a code point that starts or ends some string.
  1285. i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
  1286. if(i==rest) {
  1287. return length; // Reached the end of the string.
  1288. }
  1289. pos+=i;
  1290. rest-=i;
  1291. // Check whether the current code point is in the original set,
  1292. // without the string starts and ends.
  1293. int32_t cpLength=spanOne(spanSet, s+pos, rest);
  1294. if(cpLength>0) {
  1295. return pos; // There is a set element at pos.
  1296. }
  1297. // Try to match the strings at pos.
  1298. for(i=0; i<stringsLength; ++i) {
  1299. if(spanLengths[i]==ALL_CP_CONTAINED) {
  1300. continue; // Irrelevant string. (Also the empty string.)
  1301. }
  1302. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  1303. const char16_t *s16=string.getBuffer();
  1304. int32_t length16=string.length();
  1305. U_ASSERT(length>0);
  1306. if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
  1307. return pos; // There is a set element at pos.
  1308. }
  1309. }
  1310. // The span(while not contained) ended on a string start/end which is
  1311. // not in the original set. Skip this code point and continue.
  1312. // cpLength<0
  1313. pos-=cpLength;
  1314. rest+=cpLength;
  1315. } while(rest!=0);
  1316. return length; // Reached the end of the string.
  1317. }
  1318. int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const {
  1319. int32_t pos=length;
  1320. int32_t i, stringsLength=strings.size();
  1321. do {
  1322. // Span until we find a code point from the set,
  1323. // or a code point that starts or ends some string.
  1324. pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
  1325. if(pos==0) {
  1326. return 0; // Reached the start of the string.
  1327. }
  1328. // Check whether the current code point is in the original set,
  1329. // without the string starts and ends.
  1330. int32_t cpLength=spanOneBack(spanSet, s, pos);
  1331. if(cpLength>0) {
  1332. return pos; // There is a set element at pos.
  1333. }
  1334. // Try to match the strings at pos.
  1335. for(i=0; i<stringsLength; ++i) {
  1336. // Use spanLengths rather than a spanBackLengths pointer because
  1337. // it is easier and we only need to know whether the string is irrelevant
  1338. // which is the same in either array.
  1339. if(spanLengths[i]==ALL_CP_CONTAINED) {
  1340. continue; // Irrelevant string. (Also the empty string.)
  1341. }
  1342. const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
  1343. const char16_t *s16=string.getBuffer();
  1344. int32_t length16=string.length();
  1345. U_ASSERT(length>0);
  1346. if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
  1347. return pos; // There is a set element at pos.
  1348. }
  1349. }
  1350. // The span(while not contained) ended on a string start/end which is
  1351. // not in the original set. Skip this code point and continue.
  1352. // cpLength<0
  1353. pos+=cpLength;
  1354. } while(pos!=0);
  1355. return 0; // Reached the start of the string.
  1356. }
  1357. int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
  1358. int32_t pos=0, rest=length;
  1359. int32_t i, stringsLength=strings.size();
  1360. uint8_t *spanUTF8Lengths=spanLengths;
  1361. if(all) {
  1362. spanUTF8Lengths+=2*stringsLength;
  1363. }
  1364. do {
  1365. // Span until we find a code point from the set,
  1366. // or a code point that starts or ends some string.
  1367. i = pSpanNotSet->spanUTF8(reinterpret_cast<const char*>(s) + pos, rest, USET_SPAN_NOT_CONTAINED);
  1368. if(i==rest) {
  1369. return length; // Reached the end of the string.
  1370. }
  1371. pos+=i;
  1372. rest-=i;
  1373. // Check whether the current code point is in the original set,
  1374. // without the string starts and ends.
  1375. int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
  1376. if(cpLength>0) {
  1377. return pos; // There is a set element at pos.
  1378. }
  1379. // Try to match the strings at pos.
  1380. const uint8_t *s8=utf8;
  1381. int32_t length8;
  1382. for(i=0; i<stringsLength; ++i) {
  1383. length8=utf8Lengths[i];
  1384. // ALL_CP_CONTAINED: Irrelevant string.
  1385. if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
  1386. return pos; // There is a set element at pos.
  1387. }
  1388. s8+=length8;
  1389. }
  1390. // The span(while not contained) ended on a string start/end which is
  1391. // not in the original set. Skip this code point and continue.
  1392. // cpLength<0
  1393. pos-=cpLength;
  1394. rest+=cpLength;
  1395. } while(rest!=0);
  1396. return length; // Reached the end of the string.
  1397. }
  1398. int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
  1399. int32_t pos=length;
  1400. int32_t i, stringsLength=strings.size();
  1401. uint8_t *spanBackUTF8Lengths=spanLengths;
  1402. if(all) {
  1403. spanBackUTF8Lengths+=3*stringsLength;
  1404. }
  1405. do {
  1406. // Span until we find a code point from the set,
  1407. // or a code point that starts or ends some string.
  1408. pos = pSpanNotSet->spanBackUTF8(reinterpret_cast<const char*>(s), pos, USET_SPAN_NOT_CONTAINED);
  1409. if(pos==0) {
  1410. return 0; // Reached the start of the string.
  1411. }
  1412. // Check whether the current code point is in the original set,
  1413. // without the string starts and ends.
  1414. int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
  1415. if(cpLength>0) {
  1416. return pos; // There is a set element at pos.
  1417. }
  1418. // Try to match the strings at pos.
  1419. const uint8_t *s8=utf8;
  1420. int32_t length8;
  1421. for(i=0; i<stringsLength; ++i) {
  1422. length8=utf8Lengths[i];
  1423. // ALL_CP_CONTAINED: Irrelevant string.
  1424. if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
  1425. return pos; // There is a set element at pos.
  1426. }
  1427. s8+=length8;
  1428. }
  1429. // The span(while not contained) ended on a string start/end which is
  1430. // not in the original set. Skip this code point and continue.
  1431. // cpLength<0
  1432. pos+=cpLength;
  1433. } while(pos!=0);
  1434. return 0; // Reached the start of the string.
  1435. }
  1436. U_NAMESPACE_END