UnicodeNameToCodepoint.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
  2. //-*- C++ -*-===//
  3. //
  4. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  5. // See https://llvm.org/LICENSE.txt for license information.
  6. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements functions to map the name or alias of a unicode
  11. // character to its codepoint.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #include "llvm/ADT/STLExtras.h"
  15. #include "llvm/ADT/StringExtras.h"
  16. #include "llvm/ADT/StringRef.h"
  17. #include "llvm/Support/Unicode.h"
  18. namespace llvm {
  19. namespace sys {
  20. namespace unicode {
  21. extern const char *UnicodeNameToCodepointDict;
  22. extern const uint8_t *UnicodeNameToCodepointIndex;
  23. extern const std::size_t UnicodeNameToCodepointIndexSize;
  24. extern const std::size_t UnicodeNameToCodepointLargestNameSize;
  25. using BufferType = SmallString<64>;
  26. struct Node {
  27. bool IsRoot = false;
  28. char32_t Value = 0xFFFFFFFF;
  29. uint32_t ChildrenOffset = 0;
  30. bool HasSibling = false;
  31. uint32_t Size = 0;
  32. StringRef Name;
  33. const Node *Parent = nullptr;
  34. constexpr bool isValid() const {
  35. return !Name.empty() || Value == 0xFFFFFFFF;
  36. }
  37. constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
  38. std::string fullName() const {
  39. std::string S;
  40. // Reserve enough space for most unicode code points.
  41. // The chosen value represent the 99th percentile of name size as of
  42. // Unicode 15.0.
  43. S.reserve(46);
  44. const Node *N = this;
  45. while (N) {
  46. std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
  47. N = N->Parent;
  48. }
  49. std::reverse(S.begin(), S.end());
  50. return S;
  51. }
  52. };
  53. static Node createRoot() {
  54. Node N;
  55. N.IsRoot = true;
  56. N.ChildrenOffset = 1;
  57. N.Size = 1;
  58. return N;
  59. }
  60. static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
  61. if (Offset == 0)
  62. return createRoot();
  63. uint32_t Origin = Offset;
  64. Node N;
  65. N.Parent = Parent;
  66. uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
  67. if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
  68. return N;
  69. bool LongName = NameInfo & 0x40;
  70. bool HasValue = NameInfo & 0x80;
  71. std::size_t Size = NameInfo & ~0xC0;
  72. if (LongName) {
  73. uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
  74. NameOffset |= UnicodeNameToCodepointIndex[Offset++];
  75. N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
  76. } else {
  77. N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
  78. }
  79. if (HasValue) {
  80. uint8_t H = UnicodeNameToCodepointIndex[Offset++];
  81. uint8_t M = UnicodeNameToCodepointIndex[Offset++];
  82. uint8_t L = UnicodeNameToCodepointIndex[Offset++];
  83. N.Value = ((H << 16) | (M << 8) | L) >> 3;
  84. bool HasChildren = L & 0x02;
  85. N.HasSibling = L & 0x01;
  86. if (HasChildren) {
  87. N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
  88. N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
  89. N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
  90. }
  91. } else {
  92. uint8_t H = UnicodeNameToCodepointIndex[Offset++];
  93. N.HasSibling = H & 0x80;
  94. bool HasChildren = H & 0x40;
  95. H &= uint8_t(~0xC0);
  96. if (HasChildren) {
  97. N.ChildrenOffset = (H << 16);
  98. N.ChildrenOffset |=
  99. (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
  100. N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
  101. }
  102. }
  103. N.Size = Offset - Origin;
  104. return N;
  105. }
  106. static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
  107. std::size_t &Consummed, char &PreviousCharInName,
  108. char &PreviousCharInNeedle, bool IsPrefix = false) {
  109. Consummed = 0;
  110. if (Strict) {
  111. if (!Name.startswith(Needle))
  112. return false;
  113. Consummed = Needle.size();
  114. return true;
  115. }
  116. if (Needle.empty())
  117. return true;
  118. auto NamePos = Name.begin();
  119. auto NeedlePos = Needle.begin();
  120. char PreviousCharInNameOrigin = PreviousCharInName;
  121. char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
  122. auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
  123. bool IgnoreEnd = false) {
  124. while (It != End) {
  125. const auto Next = std::next(It);
  126. // Ignore spaces, underscore, medial hyphens
  127. // https://unicode.org/reports/tr44/#UAX44-LM2.
  128. bool Ignore =
  129. *It == ' ' || *It == '_' ||
  130. (*It == '-' && isAlnum(PreviousChar) &&
  131. ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
  132. PreviousChar = *It;
  133. if (!Ignore)
  134. break;
  135. ++It;
  136. }
  137. return It;
  138. };
  139. while (true) {
  140. NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
  141. NeedlePos =
  142. IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
  143. if (NeedlePos == Needle.end())
  144. break;
  145. if (NamePos == Name.end())
  146. break;
  147. if (toUpper(*NeedlePos) != toUpper(*NamePos))
  148. break;
  149. NeedlePos++;
  150. NamePos++;
  151. }
  152. Consummed = std::distance(Name.begin(), NamePos);
  153. if (NeedlePos != Needle.end()) {
  154. PreviousCharInName = PreviousCharInNameOrigin;
  155. PreviousCharInNeedle = PreviousCharInNeedleOrigin;
  156. }
  157. return NeedlePos == Needle.end();
  158. }
  159. static std::tuple<Node, bool, uint32_t>
  160. compareNode(uint32_t Offset, StringRef Name, bool Strict,
  161. char PreviousCharInName, char PreviousCharInNeedle,
  162. BufferType &Buffer, const Node *Parent = nullptr) {
  163. Node N = readNode(Offset, Parent);
  164. std::size_t Consummed = 0;
  165. bool DoesStartWith =
  166. N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
  167. PreviousCharInName, PreviousCharInNeedle);
  168. if (!DoesStartWith)
  169. return std::make_tuple(N, false, 0);
  170. if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
  171. return std::make_tuple(N, true, N.Value);
  172. if (N.hasChildren()) {
  173. uint32_t ChildOffset = N.ChildrenOffset;
  174. for (;;) {
  175. Node C;
  176. bool Matches;
  177. uint32_t Value;
  178. std::tie(C, Matches, Value) =
  179. compareNode(ChildOffset, Name.substr(Consummed), Strict,
  180. PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
  181. if (Matches) {
  182. std::reverse_copy(C.Name.begin(), C.Name.end(),
  183. std::back_inserter(Buffer));
  184. return std::make_tuple(N, true, Value);
  185. }
  186. ChildOffset += C.Size;
  187. if (!C.HasSibling)
  188. break;
  189. }
  190. }
  191. return std::make_tuple(N, false, 0);
  192. }
  193. static std::tuple<Node, bool, uint32_t>
  194. compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
  195. return compareNode(Offset, Name, Strict, 0, 0, Buffer);
  196. }
  197. // clang-format off
  198. constexpr const char *const HangulSyllables[][3] = {
  199. { "G", "A", "" },
  200. { "GG", "AE", "G" },
  201. { "N", "YA", "GG" },
  202. { "D", "YAE", "GS" },
  203. { "DD", "EO", "N", },
  204. { "R", "E", "NJ" },
  205. { "M", "YEO", "NH" },
  206. { "B", "YE", "D" },
  207. { "BB", "O", "L" },
  208. { "S", "WA", "LG" },
  209. { "SS", "WAE", "LM" },
  210. { "", "OE", "LB" },
  211. { "J", "YO", "LS" },
  212. { "JJ", "U", "LT" },
  213. { "C", "WEO", "LP" },
  214. { "K", "WE", "LH" },
  215. { "T", "WI", "M" },
  216. { "P", "YU", "B" },
  217. { "H", "EU", "BS" },
  218. { 0, "YI", "S" },
  219. { 0, "I", "SS" },
  220. { 0, 0, "NG" },
  221. { 0, 0, "J" },
  222. { 0, 0, "C" },
  223. { 0, 0, "K" },
  224. { 0, 0, "T" },
  225. { 0, 0, "P" },
  226. { 0, 0, "H" }
  227. };
  228. // clang-format on
  229. // Unicode 15.0
  230. // 3.12 Conjoining Jamo Behavior Common constants
  231. constexpr const char32_t SBase = 0xAC00;
  232. constexpr const uint32_t LCount = 19;
  233. constexpr const uint32_t VCount = 21;
  234. constexpr const uint32_t TCount = 28;
  235. static std::size_t findSyllable(StringRef Name, bool Strict,
  236. char &PreviousInName, int &Pos, int Column) {
  237. assert(Column == 0 || Column == 1 || Column == 2);
  238. static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
  239. char NeedleStart = 0;
  240. int Len = -1;
  241. int Prev = PreviousInName;
  242. for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
  243. StringRef Syllable(HangulSyllables[I][Column]);
  244. if (int(Syllable.size()) <= Len)
  245. continue;
  246. std::size_t Consummed = 0;
  247. char PreviousInNameCopy = PreviousInName;
  248. bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
  249. PreviousInNameCopy, NeedleStart);
  250. if (!DoesStartWith)
  251. continue;
  252. Len = Consummed;
  253. Pos = I;
  254. Prev = PreviousInNameCopy;
  255. }
  256. if (Len == -1)
  257. return 0;
  258. PreviousInName = Prev;
  259. return size_t(Len);
  260. }
  261. static std::optional<char32_t>
  262. nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  263. Buffer.clear();
  264. // Hangul Syllable Decomposition
  265. std::size_t Consummed = 0;
  266. char NameStart = 0, NeedleStart = 0;
  267. bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
  268. NameStart, NeedleStart);
  269. if (!DoesStartWith)
  270. return std::nullopt;
  271. Name = Name.substr(Consummed);
  272. int L = -1, V = -1, T = -1;
  273. Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
  274. Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
  275. Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
  276. if (L != -1 && V != -1 && T != -1 && Name.empty()) {
  277. if (!Strict) {
  278. Buffer.append("HANGUL SYLLABLE ");
  279. if (L != -1)
  280. Buffer.append(HangulSyllables[L][0]);
  281. if (V != -1)
  282. Buffer.append(HangulSyllables[V][1]);
  283. if (T != -1)
  284. Buffer.append(HangulSyllables[T][2]);
  285. }
  286. return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
  287. std::uint32_t(T);
  288. }
  289. // Otherwise, it's an illegal syllable name.
  290. return std::nullopt;
  291. }
  292. struct GeneratedNamesData {
  293. StringRef Prefix;
  294. uint32_t Start;
  295. uint32_t End;
  296. };
  297. // Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings
  298. static const GeneratedNamesData GeneratedNamesDataTable[] = {
  299. {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
  300. {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
  301. {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
  302. {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
  303. {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
  304. {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
  305. {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
  306. {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
  307. {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
  308. {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
  309. {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
  310. {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
  311. {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
  312. {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
  313. {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
  314. {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
  315. };
  316. static std::optional<char32_t>
  317. nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  318. for (auto &&Item : GeneratedNamesDataTable) {
  319. Buffer.clear();
  320. std::size_t Consummed = 0;
  321. char NameStart = 0, NeedleStart = 0;
  322. bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
  323. NameStart, NeedleStart, /*isPrefix*/ true);
  324. if (!DoesStartWith)
  325. continue;
  326. auto Number = Name.substr(Consummed);
  327. unsigned long long V = 0;
  328. // Be consistent about mandating upper casing.
  329. if (Strict &&
  330. llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
  331. return {};
  332. if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
  333. continue;
  334. if (!Strict) {
  335. Buffer.append(Item.Prefix);
  336. Buffer.append(utohexstr(V, true));
  337. }
  338. return V;
  339. }
  340. return std::nullopt;
  341. }
  342. static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
  343. BufferType &Buffer) {
  344. if (Name.empty())
  345. return std::nullopt;
  346. std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
  347. if (!Res)
  348. Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
  349. if (Res)
  350. return *Res;
  351. Buffer.clear();
  352. Node Node;
  353. bool Matches;
  354. uint32_t Value;
  355. std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
  356. if (Matches) {
  357. std::reverse(Buffer.begin(), Buffer.end());
  358. // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
  359. // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
  360. if (!Strict && Value == 0x116c &&
  361. Name.find_insensitive("O-E") != StringRef::npos) {
  362. Buffer = "HANGUL JUNGSEONG O-E";
  363. Value = 0x1180;
  364. }
  365. return Value;
  366. }
  367. return std::nullopt;
  368. }
  369. std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
  370. BufferType Buffer;
  371. auto Opt = nameToCodepoint(Name, true, Buffer);
  372. return Opt;
  373. }
  374. std::optional<LooseMatchingResult>
  375. nameToCodepointLooseMatching(StringRef Name) {
  376. BufferType Buffer;
  377. auto Opt = nameToCodepoint(Name, false, Buffer);
  378. if (!Opt)
  379. return std::nullopt;
  380. return LooseMatchingResult{*Opt, Buffer};
  381. }
  382. // Find the unicode character whose editing distance to Pattern
  383. // is shortest, using the Wagner–Fischer algorithm.
  384. llvm::SmallVector<MatchForCodepointName>
  385. nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
  386. // We maintain a fixed size vector of matches,
  387. // sorted by distance
  388. // The worst match (with the biggest distance) are discarded when new elements
  389. // are added.
  390. std::size_t LargestEditDistance = 0;
  391. llvm::SmallVector<MatchForCodepointName> Matches;
  392. Matches.reserve(MaxMatchesCount + 1);
  393. auto Insert = [&](const Node &Node, uint32_t Distance,
  394. char32_t Value) -> bool {
  395. if (Distance > LargestEditDistance) {
  396. if (Matches.size() == MaxMatchesCount)
  397. return false;
  398. LargestEditDistance = Distance;
  399. }
  400. // To avoid allocations, the creation of the name is delayed
  401. // as much as possible.
  402. std::string Name;
  403. auto GetName = [&] {
  404. if (Name.empty())
  405. Name = Node.fullName();
  406. return Name;
  407. };
  408. auto It = llvm::lower_bound(
  409. Matches, Distance,
  410. [&](const MatchForCodepointName &a, std::size_t Distance) {
  411. if (Distance == a.Distance)
  412. return a.Name < GetName();
  413. return a.Distance < Distance;
  414. });
  415. if (It == Matches.end() && Matches.size() == MaxMatchesCount)
  416. return false;
  417. MatchForCodepointName M{GetName(), Distance, Value};
  418. Matches.insert(It, std::move(M));
  419. if (Matches.size() > MaxMatchesCount)
  420. Matches.pop_back();
  421. return true;
  422. };
  423. // We ignore case, space, hyphens, etc,
  424. // in both the search pattern and the prospective names.
  425. auto Normalize = [](StringRef Name) {
  426. std::string Out;
  427. Out.reserve(Name.size());
  428. for (char C : Name) {
  429. if (isAlnum(C))
  430. Out.push_back(toUpper(C));
  431. }
  432. return Out;
  433. };
  434. std::string NormalizedName = Normalize(Pattern);
  435. // Allocate a matrix big enough for longest names.
  436. const std::size_t Columns =
  437. std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
  438. 1;
  439. LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
  440. UnicodeNameToCodepointLargestNameSize + 1;
  441. std::vector<char> Distances(
  442. Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
  443. auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
  444. assert(Column < Columns);
  445. assert(Row < Rows);
  446. return Distances[Row * Columns + Column];
  447. };
  448. for (std::size_t I = 0; I < Columns; I++)
  449. Get(I, 0) = I;
  450. // Visit the childrens,
  451. // Filling (and overriding) the matrix for the name fragment of each node
  452. // iteratively. CompleteName is used to collect the actual name of potential
  453. // match, respecting case and spacing.
  454. auto VisitNode = [&](const Node &N, std::size_t Row,
  455. auto &VisitNode) -> void {
  456. std::size_t J = 0;
  457. for (; J < N.Name.size(); J++) {
  458. if (!isAlnum(N.Name[J]))
  459. continue;
  460. Get(0, Row) = Row;
  461. for (std::size_t I = 1; I < Columns; I++) {
  462. const int Delete = Get(I - 1, Row) + 1;
  463. const int Insert = Get(I, Row - 1) + 1;
  464. const int Replace =
  465. Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
  466. Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
  467. }
  468. Row++;
  469. }
  470. unsigned Cost = Get(Columns - 1, Row - 1);
  471. if (N.Value != 0xFFFFFFFF) {
  472. Insert(N, Cost, N.Value);
  473. }
  474. if (N.hasChildren()) {
  475. auto ChildOffset = N.ChildrenOffset;
  476. for (;;) {
  477. Node C = readNode(ChildOffset, &N);
  478. ChildOffset += C.Size;
  479. if (!C.isValid())
  480. break;
  481. VisitNode(C, Row, VisitNode);
  482. if (!C.HasSibling)
  483. break;
  484. }
  485. }
  486. };
  487. Node Root = createRoot();
  488. VisitNode(Root, 1, VisitNode);
  489. return Matches;
  490. }
  491. } // namespace unicode
  492. } // namespace sys
  493. } // namespace llvm