unicode_set.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. #include "unicode_set.h"
  2. #include "category_ranges.h"
  3. #include "unicode_set_parser.h"
  4. #include <util/ysaveload.h>
  5. #include <util/charset/wide.h>
  6. #include <util/digest/numeric.h>
  7. #include <util/generic/buffer.h>
  8. #include <util/generic/yexception.h>
  9. #include <util/stream/format.h>
  10. #include <util/stream/input.h>
  11. #include <util/stream/output.h>
  12. #include <util/string/cast.h>
  13. // The original idea of unicode set implementation was taken from the icu::UnicodeSet.
  14. // UnicodeSet has a set of ranges [from, to), where upper boundary is exclusive.
  15. // The list of ranges always has a terminal value CODEPOINT_HIGH at the end.
  16. namespace NUnicode {
  17. namespace NPrivate {
  18. inline wchar32 Bound(wchar32 c) {
  19. return c < TUnicodeSet::CODEPOINT_HIGH ? c : TUnicodeSet::CODEPOINT_HIGH - 1;
  20. }
  21. inline void CheckWcType(WC_TYPE c) {
  22. if (static_cast<size_t>(c) >= CCL_NUM) {
  23. throw yexception() << "Category ID must be less than CCL_NUM (" << static_cast<size_t>(CCL_NUM) << "), specified: " << static_cast<size_t>(c);
  24. }
  25. }
  26. }
  27. // Returns the smallest value i >= from such that 'c' < Ranges[i].
  28. // Some examples:
  29. // GetRangeItem(c, 0)
  30. // set Ranges[] c=0 1 3 4 7 8
  31. // === ============== ===========
  32. // [] [0x110000] 0 0 0 0 0 0
  33. // [:Any:] [0, 0x110000] 1 1 1 1 1 1
  34. // [\u0000-\u0003] [0, 4, 0x110000] 1 1 1 2 2 2
  35. // [\u0004-\u0007] [4, 8, 0x110000] 0 0 0 1 1 2
  36. //
  37. // So, if method returns an odd value then 'c' falls to the {Range[i-1],Range[i]} range.
  38. size_t TUnicodeSet::GetRangeItem(wchar32 c, size_t from) const {
  39. Y_ASSERT(Valid());
  40. Y_ASSERT(from < Length);
  41. if (c < Ranges[from])
  42. return from;
  43. size_t lo = from;
  44. size_t hi = Length - 1;
  45. if (lo >= hi || c >= Ranges[hi - 1]) {
  46. return hi;
  47. }
  48. for (;;) {
  49. size_t i = (lo + hi) >> 1;
  50. if (i == lo) {
  51. break;
  52. } else if (c < Ranges[i]) {
  53. hi = i;
  54. } else {
  55. lo = i;
  56. }
  57. }
  58. return hi;
  59. }
  60. wchar32* TUnicodeSet::EnsureCapacity(size_t capacity) {
  61. if (capacity <= Capacity) {
  62. return const_cast<wchar32*>(Ranges);
  63. }
  64. TDynamicBuffer buf = new wchar32[capacity];
  65. Copy<const wchar32*, wchar32*>(Ranges, Ranges + Length, buf.Get());
  66. DoSwap(buf, DynBuffer);
  67. Ranges = DynBuffer.Get();
  68. Capacity = capacity;
  69. return DynBuffer.Get();
  70. }
  71. wchar32* TUnicodeSet::InsertRangeSlots(const size_t pos, const size_t count) {
  72. Y_ASSERT(pos < Length);
  73. wchar32* src = EnsureCapacity(Length + count) + Length - 1;
  74. wchar32* dst = src + count;
  75. for (size_t i = 0; i < Length - pos; ++i) {
  76. *dst-- = *src--;
  77. }
  78. Length += count;
  79. return src + 1;
  80. }
  81. void TUnicodeSet::EraseRangeSlots(const size_t pos, const size_t count) {
  82. Y_ASSERT(pos < Length);
  83. Y_ASSERT(pos + count <= Length);
  84. wchar32* dst = EnsureWritable() + pos;
  85. wchar32* src = dst + count;
  86. for (size_t i = 0; i < Length - pos - count; ++i) {
  87. *dst++ = *src++;
  88. }
  89. Length -= count;
  90. }
  91. TUnicodeSet::TUnicodeSet()
  92. : Ranges(ShortBuffer)
  93. , Length(0)
  94. , Capacity(Y_ARRAY_SIZE(ShortBuffer))
  95. {
  96. Clear();
  97. }
  98. TUnicodeSet::TUnicodeSet(const TUnicodeSet& s)
  99. : Ranges(ShortBuffer)
  100. , Length(0)
  101. , Capacity(Y_ARRAY_SIZE(ShortBuffer))
  102. {
  103. Set(s);
  104. }
  105. // from, to - inclusive
  106. TUnicodeSet::TUnicodeSet(wchar32 from, wchar32 to)
  107. : Ranges(ShortBuffer)
  108. , Length(0)
  109. , Capacity(Y_ARRAY_SIZE(ShortBuffer))
  110. {
  111. Set(from, to);
  112. }
  113. TUnicodeSet::TUnicodeSet(const TWtringBuf& s)
  114. : Ranges(ShortBuffer)
  115. , Length(0)
  116. , Capacity(Y_ARRAY_SIZE(ShortBuffer))
  117. {
  118. Set(s);
  119. }
  120. TUnicodeSet::TUnicodeSet(WC_TYPE c)
  121. : Ranges(ShortBuffer)
  122. , Length(0)
  123. , Capacity(Y_ARRAY_SIZE(ShortBuffer))
  124. {
  125. Set(c);
  126. }
  127. void TUnicodeSet::AddPredefRanges(const NPrivate::TCategoryRanges& ranges) {
  128. if (ranges.Count > 0) {
  129. for (size_t i = 0; i + 1 < ranges.Count; i += 2) {
  130. Add(ranges.Data[i], ranges.Data[i + 1] - 1);
  131. }
  132. }
  133. }
  134. TUnicodeSet& TUnicodeSet::Add(const TUnicodeSet& s) {
  135. if (Empty()) {
  136. TUnicodeSet::operator=(s);
  137. return *this;
  138. }
  139. for (size_t i = 0; i + 1 < s.Length; i += 2) {
  140. Add(s.Ranges[i], s.Ranges[i + 1] - 1);
  141. }
  142. return *this;
  143. }
  144. TUnicodeSet& TUnicodeSet::Add(const TWtringBuf& s) {
  145. const wchar16* begin = s.data();
  146. const wchar16* end = s.data() + s.size();
  147. while (begin < end) {
  148. Add(ReadSymbolAndAdvance(begin, end));
  149. }
  150. return *this;
  151. }
  152. TUnicodeSet& TUnicodeSet::Add(wchar32 c) {
  153. c = NPrivate::Bound(c);
  154. const size_t i = GetRangeItem(c);
  155. if (i & 1) {
  156. return *this;
  157. }
  158. if (c == Ranges[i] - 1) { // The char adjoins with the next range
  159. if (i > 0 && Ranges[i - 1] == c) { // The char adjoins with the previous range too
  160. if (i + 1 == Length) { // Don't delete the last TERMINAL
  161. EraseRangeSlots(i - 1, 1);
  162. } else {
  163. EraseRangeSlots(i - 1, 2); // Collapse ranges
  164. }
  165. } else {
  166. EnsureWritable()[i] = c;
  167. }
  168. } else if (i > 0 && Ranges[i - 1] == c) {
  169. ++(EnsureWritable()[i - 1]);
  170. } else {
  171. wchar32* target = InsertRangeSlots(i, 2);
  172. *target++ = c;
  173. *target = c + 1;
  174. }
  175. Y_ASSERT(Valid());
  176. return *this;
  177. }
  178. TUnicodeSet& TUnicodeSet::Add(wchar32 from, wchar32 to) {
  179. from = NPrivate::Bound(from);
  180. to = NPrivate::Bound(to);
  181. Y_ASSERT(from <= to);
  182. if (to == from) {
  183. return Add(to);
  184. } else if (from > to) {
  185. return *this;
  186. }
  187. size_t i = GetRangeItem(from);
  188. if (to < Ranges[i]) {
  189. if (i & 1) {
  190. return *this;
  191. }
  192. if (i > 0 && Ranges[i - 1] == from) {
  193. if (Ranges[i] == to + 1) {
  194. if (i + 1 == Length) {
  195. EraseRangeSlots(i - 1, 1);
  196. } else {
  197. EraseRangeSlots(i - 1, 2);
  198. }
  199. } else {
  200. EnsureWritable()[i - 1] = to + 1;
  201. }
  202. } else if (Ranges[i] == to + 1) {
  203. if (i + 1 == Length) {
  204. *InsertRangeSlots(i, 1) = from;
  205. } else {
  206. EnsureWritable()[i] = from;
  207. }
  208. } else {
  209. wchar32* target = InsertRangeSlots(i, 2);
  210. *target++ = from;
  211. *target = to + 1;
  212. }
  213. Y_ASSERT(Valid());
  214. return *this;
  215. }
  216. size_t j = GetRangeItem(to, i);
  217. Y_ASSERT(i < j);
  218. if (0 == (j & 1)) { // 'to' falls between ranges
  219. if (Ranges[j] > to + 1) {
  220. *InsertRangeSlots(j, 1) = to + 1;
  221. } else if (j + 1 < Length) { // Exclude last TERMINAL element
  222. Y_ASSERT(Ranges[j] == to + 1);
  223. // The next range adjoins with the current one. Join them
  224. ++j;
  225. }
  226. }
  227. if (0 == (i & 1)) { // 'from' falls between ranges
  228. if (i > 0 && Ranges[i - 1] == from) {
  229. --i;
  230. } else {
  231. *InsertRangeSlots(i, 1) = from;
  232. ++i;
  233. ++j;
  234. }
  235. }
  236. // Erase ranges, which are covered by the new one
  237. Y_ASSERT(i <= j);
  238. Y_ASSERT(i <= Length);
  239. Y_ASSERT(j <= Length);
  240. EraseRangeSlots(i, j - i);
  241. Y_ASSERT(Valid());
  242. return *this;
  243. }
  244. TUnicodeSet& TUnicodeSet::Add(WC_TYPE c) {
  245. NPrivate::CheckWcType(c);
  246. if (Empty()) {
  247. return Set(c);
  248. }
  249. AddPredefRanges(NPrivate::GetCategoryRanges(c));
  250. return *this;
  251. }
  252. TUnicodeSet& TUnicodeSet::AddCategory(const TStringBuf& catName) {
  253. if (Empty()) {
  254. return SetCategory(catName);
  255. }
  256. AddPredefRanges(NPrivate::GetCategoryRanges(catName));
  257. return *this;
  258. }
  259. void TUnicodeSet::SetPredefRanges(const NPrivate::TCategoryRanges& ranges) {
  260. Clear();
  261. if (ranges.Count > 0) {
  262. DynBuffer.Drop();
  263. Ranges = ranges.Data;
  264. Length = ranges.Count;
  265. Capacity = 0;
  266. }
  267. }
  268. TUnicodeSet& TUnicodeSet::Set(const TUnicodeSet& s) {
  269. if (0 == s.Capacity) {
  270. DynBuffer.Drop();
  271. Ranges = s.Ranges;
  272. Length = s.Length;
  273. Capacity = 0;
  274. } else if (s.Ranges == s.DynBuffer.Get()) {
  275. DynBuffer = s.DynBuffer;
  276. Ranges = DynBuffer.Get();
  277. Length = s.Length;
  278. Capacity = s.Capacity;
  279. } else {
  280. ::Copy(s.Ranges, s.Ranges + s.Length, EnsureCapacity(s.Length));
  281. Length = s.Length;
  282. }
  283. return *this;
  284. }
  285. TUnicodeSet& TUnicodeSet::Set(wchar32 from, wchar32 to) {
  286. from = NPrivate::Bound(from);
  287. to = NPrivate::Bound(to);
  288. Y_ASSERT(from <= to);
  289. Clear();
  290. if (to == from) {
  291. return Add(to);
  292. } else if (from > to) {
  293. return *this;
  294. }
  295. if (to + 1 != CODEPOINT_HIGH) {
  296. wchar32* target = InsertRangeSlots(0, 2);
  297. *target++ = from;
  298. *target = to + 1;
  299. } else {
  300. *InsertRangeSlots(0, 1) = from;
  301. }
  302. Y_ASSERT(Valid());
  303. return *this;
  304. }
  305. TUnicodeSet& TUnicodeSet::Set(const TWtringBuf& s) {
  306. Clear();
  307. return Add(s);
  308. }
  309. TUnicodeSet& TUnicodeSet::Set(WC_TYPE c) {
  310. NPrivate::CheckWcType(c);
  311. SetPredefRanges(NPrivate::GetCategoryRanges(c));
  312. return *this;
  313. }
  314. TUnicodeSet& TUnicodeSet::SetCategory(const TStringBuf& catName) {
  315. SetPredefRanges(NPrivate::GetCategoryRanges(catName));
  316. return *this;
  317. }
  318. TUnicodeSet& TUnicodeSet::Invert() {
  319. Y_ASSERT(Valid());
  320. if (0 == Ranges[0]) {
  321. EraseRangeSlots(0, 1);
  322. } else {
  323. *InsertRangeSlots(0, 1) = 0;
  324. }
  325. return *this;
  326. }
  327. TUnicodeSet& TUnicodeSet::MakeCaseInsensitive() {
  328. TVector<wchar32> oldRanges(Ranges, Ranges + Length);
  329. for (size_t i = 0; i + 1 < oldRanges.size(); i += 2) {
  330. for (wchar32 c = oldRanges[i]; c < oldRanges[i + 1]; ++c) {
  331. const ::NUnicode::NPrivate::TProperty& p = ::NUnicode::NPrivate::CharProperty(c);
  332. if (p.Lower) {
  333. Add(static_cast<wchar32>(c + p.Lower));
  334. }
  335. if (p.Upper) {
  336. Add(static_cast<wchar32>(c + p.Upper));
  337. }
  338. if (p.Title) {
  339. Add(static_cast<wchar32>(c + p.Title));
  340. }
  341. }
  342. }
  343. return *this;
  344. }
  345. TUnicodeSet& TUnicodeSet::Clear() {
  346. if (IsStatic() || IsShared()) {
  347. DynBuffer.Drop();
  348. ShortBuffer[0] = CODEPOINT_HIGH;
  349. Capacity = Y_ARRAY_SIZE(ShortBuffer);
  350. Ranges = ShortBuffer;
  351. } else {
  352. const_cast<wchar32*>(Ranges)[0] = CODEPOINT_HIGH;
  353. }
  354. Length = 1;
  355. return *this;
  356. }
  357. size_t TUnicodeSet::Hash() const {
  358. size_t res = 0;
  359. for (size_t i = 0; i < Length; ++i) {
  360. res = ::CombineHashes(size_t(Ranges[i]), res);
  361. }
  362. return res;
  363. }
  364. inline void WriteUnicodeChar(IOutputStream& out, wchar32 c, bool needEscape = false) {
  365. switch (c) {
  366. case wchar32('-'):
  367. case wchar32('\\'):
  368. case wchar32('^'):
  369. needEscape = true;
  370. break;
  371. default:
  372. break;
  373. }
  374. if (::IsGraph(c) && !needEscape) {
  375. char buf[4]; // Max utf8 char length is 4
  376. size_t wr = 0;
  377. WideToUTF8(&c, 1, buf, wr);
  378. Y_ASSERT(wr <= Y_ARRAY_SIZE(buf));
  379. out.Write(buf, wr);
  380. } else {
  381. TString hexRepr = IntToString<16>(c);
  382. if (c >> 8 == 0) {
  383. out << "\\x" << LeftPad(hexRepr, 2, '0');
  384. } else if (c >> 16 == 0) {
  385. out << "\\u" << LeftPad(hexRepr, 4, '0');
  386. } else {
  387. out << "\\U" << LeftPad(hexRepr, 8, '0');
  388. }
  389. }
  390. }
  391. TString TUnicodeSet::ToString(bool escapeAllChars /* = false*/) const {
  392. Y_ASSERT(Valid());
  393. TStringStream str;
  394. str.Reserve(Length * 4 + Length / 2 + 2);
  395. str.Write('[');
  396. for (size_t i = 0; i + 1 < Length; i += 2) {
  397. WriteUnicodeChar(str, Ranges[i], escapeAllChars);
  398. if (Ranges[i] + 1 < Ranges[i + 1]) {
  399. // Don't write dash for two-symbol ranges
  400. if (Ranges[i] + 2 < Ranges[i + 1]) {
  401. str.Write('-');
  402. }
  403. WriteUnicodeChar(str, Ranges[i + 1] - 1, escapeAllChars);
  404. }
  405. }
  406. str.Write(']');
  407. return str.Str();
  408. }
  409. void TUnicodeSet::Save(IOutputStream* out) const {
  410. ::SaveSize(out, Length);
  411. ::SaveArray(out, Ranges, Length);
  412. }
  413. void TUnicodeSet::Load(IInputStream* in) {
  414. const size_t length = ::LoadSize(in);
  415. if (length > 0) {
  416. ::LoadArray(in, EnsureCapacity(length), length);
  417. }
  418. Length = length;
  419. if (!Valid()) {
  420. ythrow TSerializeException() << "Loaded broken unicode set";
  421. }
  422. }
  423. TUnicodeSet& TUnicodeSet::Parse(const TWtringBuf& data) {
  424. Clear();
  425. NPrivate::ParseUnicodeSet(*this, data);
  426. return *this;
  427. }
  428. }