normalization.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. #pragma once
  2. #include "decomposition_table.h"
  3. #include <util/charset/unidata.h>
  4. #include <util/charset/wide.h>
  5. #include <util/generic/hash.h>
  6. #include <util/generic/vector.h>
  7. #include <util/generic/algorithm.h>
  8. #include <util/generic/singleton.h>
  9. #include <util/generic/noncopyable.h>
  10. #include <utility>
  11. namespace NUnicode {
  //! Unicode normalization forms supported by this header.
  enum ENormalization {
      NFD = 0,  //!< canonical decomposition
      NFC = 1,  //!< canonical decomposition, then composition
      NFKD = 2, //!< compatibility decomposition
      NFKC = 3, //!< compatibility decomposition, then composition
  };
  // Roughly speaking:
  // NFD decomposes "ё" into "е + a diacritic mark".
  // NFC first decomposes everything, then composes back everything it can.
  // NFKD does the same as NFD. In addition, for example, it turns the Roman numeral IV (\x2163)
  // into the Latin letters I and V.
  // NFKC is NFKD + composition (naturally, the Roman four is not re-formed from I and V).
  // Formal specification: http://www.unicode.org/reports/tr15/
  25. namespace NPrivate {
  26. inline const wchar32* Decomposition(const TDecompositionTable& table, wchar32 ch) {
  27. return table.Get(ch, static_cast<const wchar32*>(nullptr));
  28. }
  29. class TDecompositor {
  30. private:
  31. const TDecompositionTable& Table;
  32. public:
  33. inline TDecompositor(const TDecompositionTable& table)
  34. : Table(table)
  35. {
  36. }
  37. inline const wchar32* Decomposition(wchar32 ch) const {
  38. return NPrivate::Decomposition(Table, ch);
  39. }
  40. };
  41. template <bool IsCompat>
  42. struct TStandartDecompositor: public TDecompositor {
  43. TStandartDecompositor()
  44. : TDecompositor(NPrivate::DecompositionTable<IsCompat>())
  45. {
  46. }
  47. };
  48. template <ENormalization N>
  49. struct TShift;
  50. template <>
  51. struct TShift<NFD> {
  52. static const WC_TYPE Value = NFD_QC;
  53. };
  54. template <>
  55. struct TShift<NFC> {
  56. static const WC_TYPE Value = NFC_QC;
  57. };
  58. template <>
  59. struct TShift<NFKD> {
  60. static const WC_TYPE Value = NFKD_QC;
  61. };
  62. template <>
  63. struct TShift<NFKC> {
  64. static const WC_TYPE Value = NFKC_QC;
  65. };
  66. template <ENormalization N>
  67. inline bool Normalized(wchar32 ch) {
  68. return CharInfo(ch) & NPrivate::TShift<N>::Value;
  69. }
  70. class TComposition {
  71. private:
  72. struct TRawData {
  73. wchar32 Lead;
  74. wchar32 Tail;
  75. wchar32 Comp;
  76. };
  77. static const TRawData RawData[];
  78. static const size_t RawDataSize;
  79. class TKey: public std::pair<wchar32, wchar32> {
  80. public:
  81. inline TKey(wchar32 a, wchar32 b)
  82. : std::pair<wchar32, wchar32>(a, b)
  83. {
  84. }
  85. inline size_t Hash() const {
  86. return CombineHashes(first, second);
  87. }
  88. };
  89. template <class T>
  90. struct THash {
  91. inline size_t operator()(const T& t) const {
  92. return t.Hash();
  93. }
  94. };
  95. typedef THashMap<TKey, wchar32, THash<TKey>> TData;
  96. TData Data;
  97. public:
  98. TComposition();
  99. inline wchar32 Composite(wchar32 lead, wchar32 tail) const {
  100. TData::const_iterator i = Data.find(TKey(lead, tail));
  101. if (i == Data.end())
  102. return 0;
  103. return i->second;
  104. }
  105. };
  106. typedef std::pair<wchar32, TCombining> TSymbol;
  107. typedef TVector<TSymbol> TBuffer;
  108. template <bool doCompose>
  109. class TCompositor;
  110. template <>
  111. class TCompositor<false> {
  112. public:
  113. inline void DoComposition(TBuffer& buffer) {
  114. Y_UNUSED(buffer);
  115. }
  116. };
  117. template <>
  118. class TCompositor<true> {
  119. private:
  120. static const wchar32 NonComposite = 0;
  121. const TComposition* Composition;
  122. public:
  123. inline TCompositor()
  124. : Composition(Singleton<TComposition>())
  125. {
  126. }
  127. inline void DoComposition(TBuffer& buffer) {
  128. if (buffer.size() < 2)
  129. return;
  130. const TSymbol& leadSymbol = buffer[0];
  131. if (leadSymbol.second != 0)
  132. return;
  133. wchar32 lead = leadSymbol.first;
  134. bool oneMoreTurnPlease = false;
  135. do {
  136. oneMoreTurnPlease = false;
  137. TCombining lastCombining = 0;
  138. for (TBuffer::iterator i = buffer.begin() + 1, mi = buffer.end(); i != mi; ++i) {
  139. TCombining currentCombining = i->second;
  140. if (!(currentCombining != lastCombining && currentCombining != 0 || lastCombining == 0 && currentCombining == 0))
  141. continue;
  142. lastCombining = currentCombining;
  143. wchar32 comb = Composition->Composite(lead, i->first);
  144. if (comb == NonComposite)
  145. continue;
  146. lead = comb;
  147. buffer.erase(i);
  148. oneMoreTurnPlease = true;
  149. break;
  150. }
  151. } while (oneMoreTurnPlease);
  152. Y_ASSERT(DecompositionCombining(lead) == 0);
  153. buffer[0] = TSymbol(lead, 0);
  154. }
  155. };
  156. template <ENormalization N, typename TCharType>
  157. inline bool Normalized(const TCharType* begin, const TCharType* end) {
  158. TCombining lastCanonicalClass = 0;
  159. for (const TCharType* i = begin; i != end;) {
  160. wchar32 ch = ReadSymbolAndAdvance(i, end);
  161. TCombining canonicalClass = DecompositionCombining(ch);
  162. if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
  163. return false;
  164. if (!Normalized<N>(ch))
  165. return false;
  166. lastCanonicalClass = canonicalClass;
  167. }
  168. return true;
  169. }
  170. }
  171. template <bool compat>
  172. inline const wchar32* Decomposition(wchar32 ch) {
  173. return NPrivate::Decomposition(NPrivate::DecompositionTable<compat>(), ch);
  174. }
  175. template <ENormalization N, class TDecompositor = NPrivate::TDecompositor>
  176. class TNormalizer : NNonCopyable::TNonCopyable {
  177. private:
  178. static const ENormalization Norm = N;
  179. static const bool IsCompat = Norm == NFKD || Norm == NFKC;
  180. static const bool RequireComposition = Norm == NFC || Norm == NFKC;
  181. typedef NPrivate::TSymbol TSymbol;
  182. typedef NPrivate::TBuffer TBuffer;
  183. TBuffer Buffer;
  184. NPrivate::TCompositor<RequireComposition> Compositor;
  185. const TDecompositor& Decompositor;
  186. private:
  187. static inline bool Compare(const TSymbol& a, const TSymbol& b) {
  188. return a.second < b.second;
  189. }
  190. struct TComparer {
  191. inline bool operator()(const TSymbol& a, const TSymbol& b) {
  192. return Compare(a, b);
  193. }
  194. };
  195. template <class T>
  196. static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, T& out) {
  197. for (TBuffer::const_iterator i = begin; i != end; ++i) {
  198. WriteSymbol(i->first, out);
  199. }
  200. }
  201. static inline void Write(const TBuffer::const_iterator& begin, const TBuffer::const_iterator& end, TUtf32String& out) { // because WriteSymbol from util/charset/wide.h works wrong in this case
  202. for (TBuffer::const_iterator i = begin; i != end; ++i) {
  203. out += i->first;
  204. }
  205. }
  206. inline void SortBuffer() {
  207. if (Buffer.size() < 2)
  208. return;
  209. StableSort(Buffer.begin(), Buffer.end(), TComparer());
  210. }
  211. template <class T>
  212. inline void AddCharNoDecomposition(wchar32 c, T& out) {
  213. TCombining cc = DecompositionCombining(c);
  214. if (cc == 0) {
  215. SortBuffer();
  216. Buffer.push_back(TBuffer::value_type(c, cc));
  217. Compositor.DoComposition(Buffer);
  218. if (Buffer.size() > 1) {
  219. Write(Buffer.begin(), Buffer.end() - 1, out);
  220. Buffer.erase(Buffer.begin(), Buffer.end() - 1); // TODO I don't like this
  221. }
  222. } else {
  223. Buffer.push_back(TBuffer::value_type(c, cc));
  224. }
  225. }
  226. template <class T>
  227. inline void AddChar(wchar32 c, T& out) {
  228. const wchar32* decompBegin = Decompositor.Decomposition(c);
  229. if (decompBegin) {
  230. while (*decompBegin) {
  231. Y_ASSERT(Decompositor.Decomposition(*decompBegin) == nullptr);
  232. AddCharNoDecomposition(*(decompBegin++), out);
  233. }
  234. return;
  235. } else {
  236. AddCharNoDecomposition(c, out);
  237. }
  238. }
  239. template <class T, typename TCharType>
  240. inline void DoNormalize(const TCharType* begin, const TCharType* end, T& out) {
  241. Buffer.clear();
  242. for (const TCharType* i = begin; i != end;) {
  243. AddChar(ReadSymbolAndAdvance(i, end), out);
  244. }
  245. SortBuffer();
  246. Compositor.DoComposition(Buffer);
  247. Write(Buffer.begin(), Buffer.end(), out);
  248. }
  249. public:
  250. TNormalizer()
  251. : Decompositor(*Singleton<NPrivate::TStandartDecompositor<IsCompat>>())
  252. {
  253. }
  254. TNormalizer(const TDecompositor& decompositor)
  255. : Decompositor(decompositor)
  256. {
  257. }
  258. template <class T, typename TCharType>
  259. inline void Normalize(const TCharType* begin, const TCharType* end, T& out) {
  260. if (NPrivate::Normalized<Norm>(begin, end)) {
  261. for (const TCharType* i = begin; i != end; ++i) {
  262. WriteSymbol(*i, out);
  263. }
  264. } else {
  265. DoNormalize(begin, end, out);
  266. }
  267. }
  268. template <typename TCharType>
  269. inline void Normalize(const TCharType* begin, const TCharType* end, TUtf32String& out) {
  270. if (NPrivate::Normalized<Norm>(begin, end)) {
  271. for (const TCharType* i = begin; i != end;) {
  272. out += ReadSymbolAndAdvance(i, end);
  273. }
  274. } else {
  275. DoNormalize(begin, end, out);
  276. }
  277. }
  278. template <class T, typename TCharType>
  279. inline void Normalize(const TCharType* begin, size_t len, T& out) {
  280. return Normalize(begin, begin + len, out);
  281. }
  282. template <typename TCharType>
  283. inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& src) {
  284. if (NPrivate::Normalized<Norm>(src.begin(), src.end())) {
  285. // nothing to normalize
  286. return src;
  287. } else {
  288. TBasicString<TCharType> res;
  289. res.reserve(src.length());
  290. DoNormalize(src.begin(), src.end(), res);
  291. return res;
  292. }
  293. }
  294. };
  295. }
  296. //! decompose utf16 or utf32 string to any container supporting push_back or to T*
  297. template <NUnicode::ENormalization Norm, class T, typename TCharType>
  298. inline void Normalize(const TCharType* begin, size_t len, T& out) {
  299. ::NUnicode::TNormalizer<Norm> dec;
  300. dec.Normalize(begin, len, out);
  301. }
  302. template <NUnicode::ENormalization N, typename TCharType>
  303. inline TBasicString<TCharType> Normalize(const TCharType* str, size_t len) {
  304. TBasicString<TCharType> res;
  305. res.reserve(len);
  306. Normalize<N>(str, len, res);
  307. return res;
  308. }
  309. template <NUnicode::ENormalization N, typename TCharType>
  310. inline TBasicString<TCharType> Normalize(const TBasicString<TCharType>& str) {
  311. ::NUnicode::TNormalizer<N> dec;
  312. return dec.Normalize(str);
  313. }
  314. template <NUnicode::ENormalization N, typename TCharType>
  315. inline TBasicString<TCharType> Normalize(const TBasicStringBuf<TCharType> str) {
  316. return Normalize<N>(str.data(), str.size());
  317. }