123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550 |
- //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
- //-*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This file implements functions to map the name or alias of a unicode
- // character to its codepoint.
- //
- //===----------------------------------------------------------------------===//
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/StringExtras.h"
- #include "llvm/ADT/StringRef.h"
- #include "llvm/Support/Unicode.h"
- namespace llvm {
- namespace sys {
- namespace unicode {
- extern const char *UnicodeNameToCodepointDict;
- extern const uint8_t *UnicodeNameToCodepointIndex;
- extern const std::size_t UnicodeNameToCodepointIndexSize;
- extern const std::size_t UnicodeNameToCodepointLargestNameSize;
- using BufferType = SmallString<64>;
- struct Node {
- bool IsRoot = false;
- char32_t Value = 0xFFFFFFFF;
- uint32_t ChildrenOffset = 0;
- bool HasSibling = false;
- uint32_t Size = 0;
- StringRef Name;
- const Node *Parent = nullptr;
- constexpr bool isValid() const {
- return !Name.empty() || Value == 0xFFFFFFFF;
- }
- constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
- std::string fullName() const {
- std::string S;
- // Reserve enough space for most unicode code points.
- // The chosen value represent the 99th percentile of name size as of
- // Unicode 15.0.
- S.reserve(46);
- const Node *N = this;
- while (N) {
- std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
- N = N->Parent;
- }
- std::reverse(S.begin(), S.end());
- return S;
- }
- };
- static Node createRoot() {
- Node N;
- N.IsRoot = true;
- N.ChildrenOffset = 1;
- N.Size = 1;
- return N;
- }
- static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
- if (Offset == 0)
- return createRoot();
- uint32_t Origin = Offset;
- Node N;
- N.Parent = Parent;
- uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
- if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
- return N;
- bool LongName = NameInfo & 0x40;
- bool HasValue = NameInfo & 0x80;
- std::size_t Size = NameInfo & ~0xC0;
- if (LongName) {
- uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
- NameOffset |= UnicodeNameToCodepointIndex[Offset++];
- N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
- } else {
- N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
- }
- if (HasValue) {
- uint8_t H = UnicodeNameToCodepointIndex[Offset++];
- uint8_t M = UnicodeNameToCodepointIndex[Offset++];
- uint8_t L = UnicodeNameToCodepointIndex[Offset++];
- N.Value = ((H << 16) | (M << 8) | L) >> 3;
- bool HasChildren = L & 0x02;
- N.HasSibling = L & 0x01;
- if (HasChildren) {
- N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
- N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
- N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
- }
- } else {
- uint8_t H = UnicodeNameToCodepointIndex[Offset++];
- N.HasSibling = H & 0x80;
- bool HasChildren = H & 0x40;
- H &= uint8_t(~0xC0);
- if (HasChildren) {
- N.ChildrenOffset = (H << 16);
- N.ChildrenOffset |=
- (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
- N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
- }
- }
- N.Size = Offset - Origin;
- return N;
- }
- static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
- std::size_t &Consummed, char &PreviousCharInName,
- char &PreviousCharInNeedle, bool IsPrefix = false) {
- Consummed = 0;
- if (Strict) {
- if (!Name.startswith(Needle))
- return false;
- Consummed = Needle.size();
- return true;
- }
- if (Needle.empty())
- return true;
- auto NamePos = Name.begin();
- auto NeedlePos = Needle.begin();
- char PreviousCharInNameOrigin = PreviousCharInName;
- char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
- auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
- bool IgnoreEnd = false) {
- while (It != End) {
- const auto Next = std::next(It);
- // Ignore spaces, underscore, medial hyphens
- // https://unicode.org/reports/tr44/#UAX44-LM2.
- bool Ignore =
- *It == ' ' || *It == '_' ||
- (*It == '-' && isAlnum(PreviousChar) &&
- ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
- PreviousChar = *It;
- if (!Ignore)
- break;
- ++It;
- }
- return It;
- };
- while (true) {
- NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
- NeedlePos =
- IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
- if (NeedlePos == Needle.end())
- break;
- if (NamePos == Name.end())
- break;
- if (toUpper(*NeedlePos) != toUpper(*NamePos))
- break;
- NeedlePos++;
- NamePos++;
- }
- Consummed = std::distance(Name.begin(), NamePos);
- if (NeedlePos != Needle.end()) {
- PreviousCharInName = PreviousCharInNameOrigin;
- PreviousCharInNeedle = PreviousCharInNeedleOrigin;
- }
- return NeedlePos == Needle.end();
- }
- static std::tuple<Node, bool, uint32_t>
- compareNode(uint32_t Offset, StringRef Name, bool Strict,
- char PreviousCharInName, char PreviousCharInNeedle,
- BufferType &Buffer, const Node *Parent = nullptr) {
- Node N = readNode(Offset, Parent);
- std::size_t Consummed = 0;
- bool DoesStartWith =
- N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
- PreviousCharInName, PreviousCharInNeedle);
- if (!DoesStartWith)
- return std::make_tuple(N, false, 0);
- if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
- return std::make_tuple(N, true, N.Value);
- if (N.hasChildren()) {
- uint32_t ChildOffset = N.ChildrenOffset;
- for (;;) {
- Node C;
- bool Matches;
- uint32_t Value;
- std::tie(C, Matches, Value) =
- compareNode(ChildOffset, Name.substr(Consummed), Strict,
- PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
- if (Matches) {
- std::reverse_copy(C.Name.begin(), C.Name.end(),
- std::back_inserter(Buffer));
- return std::make_tuple(N, true, Value);
- }
- ChildOffset += C.Size;
- if (!C.HasSibling)
- break;
- }
- }
- return std::make_tuple(N, false, 0);
- }
- static std::tuple<Node, bool, uint32_t>
- compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
- return compareNode(Offset, Name, Strict, 0, 0, Buffer);
- }
- // clang-format off
- constexpr const char *const HangulSyllables[][3] = {
- { "G", "A", "" },
- { "GG", "AE", "G" },
- { "N", "YA", "GG" },
- { "D", "YAE", "GS" },
- { "DD", "EO", "N", },
- { "R", "E", "NJ" },
- { "M", "YEO", "NH" },
- { "B", "YE", "D" },
- { "BB", "O", "L" },
- { "S", "WA", "LG" },
- { "SS", "WAE", "LM" },
- { "", "OE", "LB" },
- { "J", "YO", "LS" },
- { "JJ", "U", "LT" },
- { "C", "WEO", "LP" },
- { "K", "WE", "LH" },
- { "T", "WI", "M" },
- { "P", "YU", "B" },
- { "H", "EU", "BS" },
- { 0, "YI", "S" },
- { 0, "I", "SS" },
- { 0, 0, "NG" },
- { 0, 0, "J" },
- { 0, 0, "C" },
- { 0, 0, "K" },
- { 0, 0, "T" },
- { 0, 0, "P" },
- { 0, 0, "H" }
- };
- // clang-format on
- // Unicode 15.0
- // 3.12 Conjoining Jamo Behavior Common constants
- constexpr const char32_t SBase = 0xAC00;
- constexpr const uint32_t LCount = 19;
- constexpr const uint32_t VCount = 21;
- constexpr const uint32_t TCount = 28;
- static std::size_t findSyllable(StringRef Name, bool Strict,
- char &PreviousInName, int &Pos, int Column) {
- assert(Column == 0 || Column == 1 || Column == 2);
- static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
- char NeedleStart = 0;
- int Len = -1;
- int Prev = PreviousInName;
- for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
- StringRef Syllable(HangulSyllables[I][Column]);
- if (int(Syllable.size()) <= Len)
- continue;
- std::size_t Consummed = 0;
- char PreviousInNameCopy = PreviousInName;
- bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
- PreviousInNameCopy, NeedleStart);
- if (!DoesStartWith)
- continue;
- Len = Consummed;
- Pos = I;
- Prev = PreviousInNameCopy;
- }
- if (Len == -1)
- return 0;
- PreviousInName = Prev;
- return size_t(Len);
- }
- static std::optional<char32_t>
- nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
- Buffer.clear();
- // Hangul Syllable Decomposition
- std::size_t Consummed = 0;
- char NameStart = 0, NeedleStart = 0;
- bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
- NameStart, NeedleStart);
- if (!DoesStartWith)
- return std::nullopt;
- Name = Name.substr(Consummed);
- int L = -1, V = -1, T = -1;
- Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
- Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
- Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
- if (L != -1 && V != -1 && T != -1 && Name.empty()) {
- if (!Strict) {
- Buffer.append("HANGUL SYLLABLE ");
- if (L != -1)
- Buffer.append(HangulSyllables[L][0]);
- if (V != -1)
- Buffer.append(HangulSyllables[V][1]);
- if (T != -1)
- Buffer.append(HangulSyllables[T][2]);
- }
- return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
- std::uint32_t(T);
- }
- // Otherwise, it's an illegal syllable name.
- return std::nullopt;
- }
- struct GeneratedNamesData {
- StringRef Prefix;
- uint32_t Start;
- uint32_t End;
- };
- // Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings
- static const GeneratedNamesData GeneratedNamesDataTable[] = {
- {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
- {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
- {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
- {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
- {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
- {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
- {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
- {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
- {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
- {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
- {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
- {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
- {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
- {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
- {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
- {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
- };
- static std::optional<char32_t>
- nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
- for (auto &&Item : GeneratedNamesDataTable) {
- Buffer.clear();
- std::size_t Consummed = 0;
- char NameStart = 0, NeedleStart = 0;
- bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
- NameStart, NeedleStart, /*isPrefix*/ true);
- if (!DoesStartWith)
- continue;
- auto Number = Name.substr(Consummed);
- unsigned long long V = 0;
- // Be consistent about mandating upper casing.
- if (Strict &&
- llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
- return {};
- if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
- continue;
- if (!Strict) {
- Buffer.append(Item.Prefix);
- Buffer.append(utohexstr(V, true));
- }
- return V;
- }
- return std::nullopt;
- }
- static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
- BufferType &Buffer) {
- if (Name.empty())
- return std::nullopt;
- std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
- if (!Res)
- Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
- if (Res)
- return *Res;
- Buffer.clear();
- Node Node;
- bool Matches;
- uint32_t Value;
- std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
- if (Matches) {
- std::reverse(Buffer.begin(), Buffer.end());
- // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
- // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
- if (!Strict && Value == 0x116c &&
- Name.find_insensitive("O-E") != StringRef::npos) {
- Buffer = "HANGUL JUNGSEONG O-E";
- Value = 0x1180;
- }
- return Value;
- }
- return std::nullopt;
- }
- std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
- BufferType Buffer;
- auto Opt = nameToCodepoint(Name, true, Buffer);
- return Opt;
- }
- std::optional<LooseMatchingResult>
- nameToCodepointLooseMatching(StringRef Name) {
- BufferType Buffer;
- auto Opt = nameToCodepoint(Name, false, Buffer);
- if (!Opt)
- return std::nullopt;
- return LooseMatchingResult{*Opt, Buffer};
- }
- // Find the unicode character whose editing distance to Pattern
- // is shortest, using the Wagner–Fischer algorithm.
- llvm::SmallVector<MatchForCodepointName>
- nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
- // We maintain a fixed size vector of matches,
- // sorted by distance
- // The worst match (with the biggest distance) are discarded when new elements
- // are added.
- std::size_t LargestEditDistance = 0;
- llvm::SmallVector<MatchForCodepointName> Matches;
- Matches.reserve(MaxMatchesCount + 1);
- auto Insert = [&](const Node &Node, uint32_t Distance,
- char32_t Value) -> bool {
- if (Distance > LargestEditDistance) {
- if (Matches.size() == MaxMatchesCount)
- return false;
- LargestEditDistance = Distance;
- }
- // To avoid allocations, the creation of the name is delayed
- // as much as possible.
- std::string Name;
- auto GetName = [&] {
- if (Name.empty())
- Name = Node.fullName();
- return Name;
- };
- auto It = llvm::lower_bound(
- Matches, Distance,
- [&](const MatchForCodepointName &a, std::size_t Distance) {
- if (Distance == a.Distance)
- return a.Name < GetName();
- return a.Distance < Distance;
- });
- if (It == Matches.end() && Matches.size() == MaxMatchesCount)
- return false;
- MatchForCodepointName M{GetName(), Distance, Value};
- Matches.insert(It, std::move(M));
- if (Matches.size() > MaxMatchesCount)
- Matches.pop_back();
- return true;
- };
- // We ignore case, space, hyphens, etc,
- // in both the search pattern and the prospective names.
- auto Normalize = [](StringRef Name) {
- std::string Out;
- Out.reserve(Name.size());
- for (char C : Name) {
- if (isAlnum(C))
- Out.push_back(toUpper(C));
- }
- return Out;
- };
- std::string NormalizedName = Normalize(Pattern);
- // Allocate a matrix big enough for longest names.
- const std::size_t Columns =
- std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
- 1;
- LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
- UnicodeNameToCodepointLargestNameSize + 1;
- std::vector<char> Distances(
- Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
- auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
- assert(Column < Columns);
- assert(Row < Rows);
- return Distances[Row * Columns + Column];
- };
- for (std::size_t I = 0; I < Columns; I++)
- Get(I, 0) = I;
- // Visit the childrens,
- // Filling (and overriding) the matrix for the name fragment of each node
- // iteratively. CompleteName is used to collect the actual name of potential
- // match, respecting case and spacing.
- auto VisitNode = [&](const Node &N, std::size_t Row,
- auto &VisitNode) -> void {
- std::size_t J = 0;
- for (; J < N.Name.size(); J++) {
- if (!isAlnum(N.Name[J]))
- continue;
- Get(0, Row) = Row;
- for (std::size_t I = 1; I < Columns; I++) {
- const int Delete = Get(I - 1, Row) + 1;
- const int Insert = Get(I, Row - 1) + 1;
- const int Replace =
- Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
- Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
- }
- Row++;
- }
- unsigned Cost = Get(Columns - 1, Row - 1);
- if (N.Value != 0xFFFFFFFF) {
- Insert(N, Cost, N.Value);
- }
- if (N.hasChildren()) {
- auto ChildOffset = N.ChildrenOffset;
- for (;;) {
- Node C = readNode(ChildOffset, &N);
- ChildOffset += C.Size;
- if (!C.isValid())
- break;
- VisitNode(C, Row, VisitNode);
- if (!C.HasSibling)
- break;
- }
- }
- };
- Node Root = createRoot();
- VisitNode(Root, 1, VisitNode);
- return Matches;
- }
- } // namespace unicode
- } // namespace sys
- } // namespace llvm
|