htmlentity.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. #include "htmlentity.h"
  2. #include <util/string/util.h>
  3. #include <util/system/defaults.h>
  4. #include <library/cpp/charset/recyr.hh>
  5. #include <library/cpp/charset/codepage.h>
  6. #include <util/charset/utf8.h>
  7. #include <util/string/strspn.h>
  8. #include <util/string/hex.h>
  9. #include <util/generic/hash_set.h>
  10. #define isalpha(c) ('a' <= (c) && (c) <= 'z' || 'A' <= (c) && (c) <= 'Z')
  11. #define isdigit(c) ('0' <= (c) && (c) <= '9')
  12. #define isalnum(c) (isalpha(c) || isdigit(c))
  13. #define TEST_CHAR_AT_IMPL(condition, i, len) ((i < (len)) && (condition(s[i])))
  14. #define TEST_CHAR_AT(condition, i) TEST_CHAR_AT_IMPL(condition, i, len)
  15. static const ui32 UNICODE_BORDER = 0x10FFFF;
  16. enum EPureType {
  17. PT_SEMIC, // Semicolumn shoud always present
  18. PT_HTML5,
  19. PT_HTML5_ATTR
  20. };
  21. // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference (anything else comments)
  22. template <EPureType PURE>
  23. static inline bool PureCondition(const char* afterEntityStr, size_t len) {
  24. if (PURE == PT_HTML5)
  25. return true;
  26. const char* s = afterEntityStr;
  27. if (PURE == PT_SEMIC) {
  28. return TEST_CHAR_AT(';' ==, 0);
  29. } else {
  30. return TEST_CHAR_AT(';' ==, 0) || !(TEST_CHAR_AT('=' ==, 1) || TEST_CHAR_AT(isalnum, 1));
  31. }
  32. }
  33. template <EPureType PURE>
  34. inline static bool DetectEntity(const unsigned char* const str, size_t len, TEntity* entity) {
  35. if (len == 0)
  36. return 0;
  37. Y_ASSERT(str[0] == '&');
  38. if (DecodeNamedEntity(str + 1, len - 1, entity)) { // exclude '&'
  39. if (PureCondition<PURE>((const char*)str + entity->Len, len - entity->Len)) {
  40. entity->Len += 1; // add '&'
  41. Y_ASSERT(entity->Len <= len);
  42. return true;
  43. }
  44. }
  45. return false;
  46. }
  47. static size_t DetectNumber(const char* inputStr, size_t len, wchar32* codepoint) {
  48. if (len < 2)
  49. return 0;
  50. Y_ASSERT(inputStr[0] == '#');
  51. static TCompactStrSpn DIGITS("0123456789");
  52. const char* digitEnd = DIGITS.FindFirstNotOf<const char*>(inputStr + 1, inputStr + len);
  53. if (digitEnd == inputStr + 1)
  54. return 0;
  55. *codepoint = inputStr[1] - '0';
  56. for (auto sym = inputStr + 2; sym != digitEnd; ++sym) {
  57. if (*codepoint < UNICODE_BORDER)
  58. *codepoint = *codepoint * 10 + (*sym - '0');
  59. }
  60. return digitEnd - inputStr;
  61. }
  62. static size_t DetectXNumber(const char* inputStr, size_t len, wchar32* codepoint) {
  63. if (len < 3)
  64. return 0;
  65. Y_ASSERT(inputStr[0] == '#');
  66. Y_ASSERT(inputStr[1] == 'x' || inputStr[1] == 'X');
  67. static TCompactStrSpn XDIGITS("0123456789ABCDEFabcdef");
  68. const char* digitEnd = XDIGITS.FindFirstNotOf<const char*>(inputStr + 2, inputStr + len);
  69. if (digitEnd == inputStr + 2)
  70. return 0;
  71. *codepoint = Char2Digit(inputStr[2]);
  72. for (const char* sym = inputStr + 3; sym != digitEnd; ++sym) {
  73. if (*codepoint < UNICODE_BORDER)
  74. *codepoint = *codepoint * 16 + Char2Digit(*sym);
  75. }
  76. return digitEnd - inputStr;
  77. }
  78. ///////////////////////////////////////////////////////////////////////////////
  79. static inline void FixBadNumber(wchar32* c) {
  80. if (*c == 0)
  81. *c = BROKEN_RUNE;
  82. if ((0xD800 <= *c && *c <= 0xDFFF) || *c > UNICODE_BORDER) {
  83. *c = BROKEN_RUNE;
  84. }
  85. if (128 <= *c && *c < 160)
  86. *c = CodePageByCharset(CODES_ASCII)->unicode[*c];
  87. // I don't know what does it mean and what the reason.
  88. if (0xF000 <= *c && *c < 0xF100) // UNKNOWN PLANE
  89. *c = '\x20';
  90. }
  91. template <EPureType PURE>
  92. static inline size_t DoNumber(const unsigned char* const s, size_t len, wchar32* c) {
  93. Y_ASSERT(s[0] == '#');
  94. size_t clen = 0;
  95. if (s[1] == 'x' || s[1] == 'X')
  96. clen = DetectXNumber((const char*)s, len, c);
  97. else
  98. clen = DetectNumber((const char*)s, len, c);
  99. if (clen != 0) {
  100. if (!PureCondition<PURE>((const char*)s + clen, len - clen)) {
  101. return 0;
  102. }
  103. FixBadNumber(c);
  104. return clen + TEST_CHAR_AT(';' ==, clen);
  105. }
  106. return 0;
  107. }
  108. static inline size_t DoSymbol(ECharset cp, const unsigned char* const s, size_t len, wchar32* c) {
  109. size_t written = 0;
  110. size_t clen = 0;
  111. RECODE_RESULT res = RecodeToUnicode(cp, (const char*)s, c, len, 1, clen, written);
  112. bool error = !(res == RECODE_OK || res == RECODE_EOOUTPUT);
  113. if (error || clen == 0)
  114. clen = 1;
  115. if (error || written == 0)
  116. *c = BROKEN_RUNE;
  117. return clen;
  118. }
  119. ///////////////////////////////////////////////////////////////////////////////
  120. template <EPureType PURE>
  121. inline bool HtTryDecodeEntityT(const unsigned char* const s, size_t len, TEntity* entity) {
  122. Y_ASSERT(len != 0);
  123. Y_ASSERT(s[0] == '&');
  124. if (len > 2) {
  125. if (isalpha(s[1])) {
  126. return DetectEntity<PURE>(s, len, entity);
  127. }
  128. if (s[1] == '#') {
  129. entity->Codepoint2 = 0;
  130. entity->Len = DoNumber<PURE>(s + 1, len - 1, &(entity->Codepoint1));
  131. if (entity->Len != 0) {
  132. entity->Len += 1; // Add '&'
  133. Y_ASSERT(entity->Len <= len);
  134. return true;
  135. }
  136. }
  137. }
  138. return false;
  139. }
  140. template <EPureType PURE>
  141. inline bool HtTryDecodeEntityT(const TStringBuf& str, TEntity* entity) {
  142. return HtTryDecodeEntityT<PURE>((const unsigned char*)str.data(), str.length(), entity);
  143. }
  144. bool HtTryDecodeEntity(const char* str, size_t len, TEntity* entity) {
  145. return HtTryDecodeEntityT<PT_HTML5>((const unsigned char*)str, len, entity);
  146. }
  147. ///////////////////////////////////////////////////////////////////////////////
  148. // the string is in ASCII-compatible encoding, so entities are found as-is
  149. TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc) {
  150. const char* const dstbeg = dst;
  151. const char* const dstend = dstbeg + dstlen;
  152. TStringBuf out;
  153. TStringBuf str(src);
  154. for (size_t curpos = 0, nwr = 0;;) {
  155. const size_t nxtpos = str.find('&', curpos);
  156. const TStringBuf tail = str.SubStr(nxtpos);
  157. if (tail.empty()) {
  158. if (dstbeg == dst) { // we haven't written anything
  159. out = src;
  160. break;
  161. }
  162. if (dst + str.length() <= dstend) { // sufficient space
  163. memmove(dst, str.data(), str.length());
  164. out = TStringBuf(dstbeg, dst - dstbeg + str.length());
  165. }
  166. break;
  167. }
  168. if (dst + nxtpos >= dstend) // insufficient space
  169. break;
  170. TEntity entity;
  171. if (!HtTryDecodeEntityT<PT_HTML5>(tail, &entity)) {
  172. ++curpos;
  173. continue;
  174. }
  175. memmove(dst, str.data(), nxtpos);
  176. dst += nxtpos;
  177. if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint1, dst, dstend - dst, nwr))
  178. break;
  179. dst += nwr;
  180. if (entity.Codepoint2 != 0) {
  181. if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint2, dst, dstend - dst, nwr))
  182. break;
  183. dst += nwr;
  184. }
  185. str = tail.SubStr(entity.Len);
  186. curpos = 0;
  187. }
  188. return out;
  189. }
  190. // the string is in ASCII-compatible encoding, so entities are found as-is
  191. // however, the target encoding is potentially different
  192. TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc, ECharset cpdst) {
  193. if (cpsrc == cpdst)
  194. return HtTryEntDecodeAsciiCompat(src, dst, dstlen, cpsrc);
  195. const char* const dstbeg = dst;
  196. const char* const dstend = dstbeg + dstlen;
  197. TStringBuf out;
  198. TStringBuf str(src);
  199. for (size_t curpos = 0, nrd, nwr;;) {
  200. const size_t nxtpos = str.find('&', curpos);
  201. const TStringBuf tail = str.SubStr(nxtpos);
  202. if (tail.empty()) {
  203. if (RECODE_OK == Recode(cpsrc, cpdst, str.data(), dst, str.length(), dstend - dst, nrd, nwr))
  204. out = TStringBuf(dstbeg, dst - dstbeg + nwr);
  205. break;
  206. }
  207. TEntity entity;
  208. if (!HtTryDecodeEntityT<PT_HTML5>(tail, &entity)) {
  209. ++curpos;
  210. continue;
  211. }
  212. if (RECODE_OK != Recode(cpsrc, cpdst, str.data(), dst, nxtpos, dstend - dst, nrd, nwr))
  213. break;
  214. dst += nwr;
  215. if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint1, dst, dstend - dst, nwr))
  216. break;
  217. dst += nwr;
  218. if (entity.Codepoint2 != 0) {
  219. if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint2, dst, dstend - dst, nwr))
  220. break;
  221. dst += nwr;
  222. }
  223. str = tail.SubStr(entity.Len);
  224. curpos = 0;
  225. }
  226. return out;
  227. }
  228. ///////////////////////////////////////////////////////////////////////////////
  229. template <EPureType PURE>
  230. inline static std::pair<wchar32, wchar32> HtEntDecodeStepT(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map, bool old = false) {
  231. if (len == 0)
  232. return std::make_pair(0, 0);
  233. TEntity entity = {0, 0, 0};
  234. if (s[0] == '&') {
  235. if (!HtTryDecodeEntityT<PURE>(s, len, &entity) || (entity.Codepoint2 != 0 && old)) {
  236. entity.Len = 1;
  237. entity.Codepoint1 = '&';
  238. }
  239. } else {
  240. entity.Len = DoSymbol(cp, s, len, &(entity.Codepoint1));
  241. }
  242. Y_ASSERT(entity.Len <= len);
  243. s += entity.Len;
  244. if (map && *map)
  245. *(*map)++ = (unsigned char)entity.Len;
  246. return std::make_pair(entity.Codepoint1, entity.Codepoint2);
  247. }
  248. std::pair<wchar32, wchar32> HtEntDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
  249. return HtEntDecodeStepT<PT_HTML5>(cp, str, len, map);
  250. }
  251. std::pair<wchar32, wchar32> HtEntPureDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
  252. return HtEntDecodeStepT<PT_SEMIC>(cp, str, len, map);
  253. }
  254. wchar32 HtEntOldDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
  255. return HtEntDecodeStepT<PT_HTML5>(cp, str, len, map, true).first;
  256. }
  257. wchar32 HtEntOldPureDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
  258. return HtEntDecodeStepT<PT_SEMIC>(cp, str, len, map, true).first;
  259. }
  260. ///////////////////////////////////////////////////////////////////////////////
  261. size_t HtEntDecode(ECharset cp, const char* str, size_t len, wchar32* buf, size_t buflen, unsigned char* map) {
  262. const unsigned char* s = (const unsigned char*)str;
  263. const unsigned char* end = (const unsigned char*)(str + len);
  264. size_t ret = 0;
  265. while (s < end & ret < buflen) {
  266. const auto codepoints = HtEntDecodeStep(cp, s, end - s, &map);
  267. *buf++ = codepoints.first;
  268. ret++;
  269. if (codepoints.second != 0 && ret < buflen) {
  270. *buf++ = codepoints.second;
  271. ret++;
  272. }
  273. }
  274. return ret;
  275. }
  276. static const THashSet<ECharset> nonCompliant = {
  277. CODES_UNKNOWNPLANE,
  278. CODES_CP864,
  279. CODES_ISO646_CN,
  280. CODES_ISO646_JP,
  281. CODES_JISX0201,
  282. CODES_TCVN,
  283. CODES_TDS565,
  284. CODES_VISCII};
  285. static bool IsAsciiCompliant(ECharset dc) {
  286. return nonCompliant.count(dc) == 0 && (SingleByteCodepage(dc) || dc == CODES_UTF8);
  287. }
  288. const ui32 LOW_CHAR_COUNT = 0x80;
  289. class TNotRecoded {
  290. public:
  291. bool Flags[LOW_CHAR_COUNT << 1];
  292. bool AsciiCharsets[CODES_MAX];
  293. public:
  294. TNotRecoded() {
  295. memset(&Flags[0], true, LOW_CHAR_COUNT * sizeof(bool));
  296. memset(&Flags[LOW_CHAR_COUNT], false, LOW_CHAR_COUNT * sizeof(bool));
  297. Flags[(ui8)'&'] = false;
  298. Flags[0x7E] = false;
  299. Flags[0x5C] = false;
  300. for (ui32 c = 0; c < CODES_MAX; c++) {
  301. AsciiCharsets[c] = IsAsciiCompliant((ECharset)c);
  302. }
  303. }
  304. bool NotRecoded(unsigned char c) const noexcept {
  305. return Flags[static_cast<ui8>(c)];
  306. }
  307. bool AsciiComliant(ECharset c) const noexcept {
  308. return (static_cast<int>(c) >= 0) ? AsciiCharsets[c] : false;
  309. }
  310. };
  311. const TNotRecoded NotRecoded;
  312. template <EPureType PURE>
  313. static size_t HtEntDecodeToUtf8T(ECharset cp,
  314. const char* src, size_t srclen,
  315. char* dst, size_t dstlen) {
  316. const unsigned char* srcptr = reinterpret_cast<const unsigned char*>(src);
  317. unsigned char* dstptr = reinterpret_cast<unsigned char*>(dst);
  318. const unsigned char* const dstbeg = dstptr;
  319. const unsigned char* const srcend = srcptr + srclen;
  320. const unsigned char* const dstend = dstbeg + dstlen;
  321. bool asciiCompl = NotRecoded.AsciiComliant(cp);
  322. for (size_t len = 0; srcptr < srcend;) {
  323. if (asciiCompl && NotRecoded.NotRecoded(*srcptr)) {
  324. if (Y_UNLIKELY(dstptr >= dstend)) {
  325. return 0;
  326. }
  327. *dstptr++ = *srcptr++;
  328. continue;
  329. }
  330. const auto runes = HtEntDecodeStepT<PURE>(cp, srcptr, srcend - srcptr, nullptr);
  331. if (RECODE_OK != SafeWriteUTF8Char(runes.first, len, dstptr, dstend))
  332. return 0;
  333. dstptr += len;
  334. if (runes.second != 0) {
  335. if (RECODE_OK != SafeWriteUTF8Char(runes.second, len, dstptr, dstend))
  336. return 0;
  337. dstptr += len;
  338. }
  339. }
  340. return dstptr - dstbeg;
  341. }
  342. size_t HtEntDecodeToUtf8(ECharset cp,
  343. const char* src, size_t srclen,
  344. char* dst, size_t dstlen) {
  345. return HtEntDecodeToUtf8T<PT_HTML5>(cp, src, srclen, dst, dstlen);
  346. }
  347. size_t HtDecodeAttrToUtf8(ECharset cp,
  348. const char* src, size_t srclen,
  349. char* dst, size_t dstlen) {
  350. return HtEntDecodeToUtf8T<PT_HTML5_ATTR>(cp, src, srclen, dst, dstlen);
  351. }
  352. size_t HtEntDecodeToChar(ECharset cp, const char* str, size_t len, wchar16* dst, unsigned char* m) {
  353. const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
  354. const unsigned char* end = reinterpret_cast<const unsigned char*>(str + len);
  355. wchar16* startDst = dst;
  356. bool asciiCompl = NotRecoded.AsciiComliant(cp);
  357. while (s < end) {
  358. if (asciiCompl && NotRecoded.NotRecoded(*s)) {
  359. *dst++ = *s++;
  360. continue;
  361. }
  362. const auto codepoints = HtEntDecodeStep(cp, s, end - s, &m);
  363. const size_t len2 = WriteSymbol(codepoints.first, dst);
  364. if (codepoints.second != 0)
  365. WriteSymbol(codepoints.second, dst);
  366. if (m != nullptr && len2 > 1)
  367. *(m++) = 0;
  368. }
  369. return dst - startDst;
  370. }
  371. bool HtLinkDecode(const char* in, char* out, size_t buflen, size_t& written, ECharset cp) {
  372. return HtLinkDecode(TStringBuf(in, strlen(in)), out, buflen, written, cp);
  373. }
  374. bool HtLinkDecode(const TStringBuf& in, char* out, size_t buflen, size_t& written, ECharset cp) {
  375. static const char XDIGIT[] = "0123456789ABCDEFabcdef";
  376. written = 0;
  377. size_t elen = 0;
  378. const char* inpEnd = in.data() + in.size();
  379. bool asciiCompl = NotRecoded.AsciiComliant(cp);
  380. for (const char* p = in.data(); p < inpEnd && *p; p += elen) {
  381. bool isEntity = false;
  382. wchar32 charval = (unsigned char)*p;
  383. elen = 1;
  384. if (*p == '&') {
  385. TEntity entity;
  386. if (HtTryDecodeEntityT<PT_SEMIC>((const unsigned char*)p, inpEnd - p, &entity) && entity.Codepoint2 == 0) {
  387. elen = entity.Len;
  388. charval = entity.Codepoint1;
  389. isEntity = true;
  390. } else {
  391. charval = '&';
  392. elen = 1;
  393. }
  394. }
  395. if (cp != CODES_UNKNOWN && !isEntity) {
  396. if (asciiCompl && NotRecoded.NotRecoded(*p)) {
  397. charval = *p;
  398. } else {
  399. DoSymbol(cp, reinterpret_cast<const unsigned char*>(p), 6, &charval);
  400. if (charval == BROKEN_RUNE)
  401. return false;
  402. }
  403. isEntity = true;
  404. }
  405. if (charval <= 0x20 || charval >= 0x7F) {
  406. if (isEntity && charval >= 0x7F) {
  407. const size_t BUFLEN = 4; // 4 max length of UTF8 encoded character
  408. unsigned char buf[BUFLEN];
  409. size_t len = 0;
  410. if (SafeWriteUTF8Char(charval, len, buf, buf + BUFLEN) != RECODE_OK) // actually always OK
  411. return false;
  412. const size_t n = len * 3;
  413. if (written + n < buflen) {
  414. for (size_t i = 0; i < len; ++i) {
  415. out[written++] = '%';
  416. out[written++] = XDIGIT[buf[i] >> 4];
  417. out[written++] = XDIGIT[buf[i] & 15];
  418. }
  419. } else
  420. return false; // ERROR_SMALL_BUFFER
  421. } else {
  422. if (written + 3 > buflen)
  423. return false; // ERROR_SMALL_BUFFER
  424. unsigned char ch = *p;
  425. if (isEntity) {
  426. ch = charval;
  427. }
  428. out[written++] = '%';
  429. out[written++] = XDIGIT[ch >> 4];
  430. out[written++] = XDIGIT[ch & 15];
  431. }
  432. } else {
  433. if (written + 1 < buflen) {
  434. out[written++] = (unsigned char)charval;
  435. } else {
  436. return false; // ERROR_SMALL_BUFFER
  437. }
  438. }
  439. }
  440. return true;
  441. }