123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546 |
- #include "htmlentity.h"
- #include <util/string/util.h>
- #include <util/system/defaults.h>
- #include <library/cpp/charset/recyr.hh>
- #include <library/cpp/charset/codepage.h>
- #include <util/charset/utf8.h>
- #include <util/string/strspn.h>
- #include <util/string/hex.h>
- #include <util/generic/hash_set.h>
// Locale-independent ASCII character classifiers. They deliberately replace
// the <cctype> names for the rest of this translation unit.
// Every sub-expression is parenthesized so the grouping of && and || is
// explicit and does not depend on operator precedence.
#define isalpha(c) ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z')))
#define isdigit(c) (('0' <= (c)) && ((c) <= '9'))
#define isalnum(c) (isalpha(c) || isdigit(c))
// Bounds-checked test of one character: true when index `i` is below `len`
// and `condition` applied to s[i] holds. `condition` is either a function-like
// name (isalnum) or a partial comparison such as `';' ==`.
// The macro reads a variable named `s` from the scope where it is expanded.
// `i` is parenthesized so that an expression argument is evaluated as a unit.
#define TEST_CHAR_AT_IMPL(condition, i, len) (((i) < (len)) && (condition(s[(i)])))
// Same as TEST_CHAR_AT_IMPL, taking the length from a variable named `len`
// in the expanding scope.
#define TEST_CHAR_AT(condition, i) TEST_CHAR_AT_IMPL(condition, i, len)
- static const ui32 UNICODE_BORDER = 0x10FFFF;
- enum EPureType {
- PT_SEMIC, // Semicolumn shoud always present
- PT_HTML5,
- PT_HTML5_ATTR
- };
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference (anything else comments)
- template <EPureType PURE>
- static inline bool PureCondition(const char* afterEntityStr, size_t len) {
- if (PURE == PT_HTML5)
- return true;
- const char* s = afterEntityStr;
- if (PURE == PT_SEMIC) {
- return TEST_CHAR_AT(';' ==, 0);
- } else {
- return TEST_CHAR_AT(';' ==, 0) || !(TEST_CHAR_AT('=' ==, 1) || TEST_CHAR_AT(isalnum, 1));
- }
- }
- template <EPureType PURE>
- inline static bool DetectEntity(const unsigned char* const str, size_t len, TEntity* entity) {
- if (len == 0)
- return 0;
- Y_ASSERT(str[0] == '&');
- if (DecodeNamedEntity(str + 1, len - 1, entity)) { // exclude '&'
- if (PureCondition<PURE>((const char*)str + entity->Len, len - entity->Len)) {
- entity->Len += 1; // add '&'
- Y_ASSERT(entity->Len <= len);
- return true;
- }
- }
- return false;
- }
- static size_t DetectNumber(const char* inputStr, size_t len, wchar32* codepoint) {
- if (len < 2)
- return 0;
- Y_ASSERT(inputStr[0] == '#');
- static TCompactStrSpn DIGITS("0123456789");
- const char* digitEnd = DIGITS.FindFirstNotOf<const char*>(inputStr + 1, inputStr + len);
- if (digitEnd == inputStr + 1)
- return 0;
- *codepoint = inputStr[1] - '0';
- for (auto sym = inputStr + 2; sym != digitEnd; ++sym) {
- if (*codepoint < UNICODE_BORDER)
- *codepoint = *codepoint * 10 + (*sym - '0');
- }
- return digitEnd - inputStr;
- }
- static size_t DetectXNumber(const char* inputStr, size_t len, wchar32* codepoint) {
- if (len < 3)
- return 0;
- Y_ASSERT(inputStr[0] == '#');
- Y_ASSERT(inputStr[1] == 'x' || inputStr[1] == 'X');
- static TCompactStrSpn XDIGITS("0123456789ABCDEFabcdef");
- const char* digitEnd = XDIGITS.FindFirstNotOf<const char*>(inputStr + 2, inputStr + len);
- if (digitEnd == inputStr + 2)
- return 0;
- *codepoint = Char2Digit(inputStr[2]);
- for (const char* sym = inputStr + 3; sym != digitEnd; ++sym) {
- if (*codepoint < UNICODE_BORDER)
- *codepoint = *codepoint * 16 + Char2Digit(*sym);
- }
- return digitEnd - inputStr;
- }
- ///////////////////////////////////////////////////////////////////////////////
- static inline void FixBadNumber(wchar32* c) {
- if (*c == 0)
- *c = BROKEN_RUNE;
- if ((0xD800 <= *c && *c <= 0xDFFF) || *c > UNICODE_BORDER) {
- *c = BROKEN_RUNE;
- }
- if (128 <= *c && *c < 160)
- *c = CodePageByCharset(CODES_ASCII)->unicode[*c];
- // I don't know what does it mean and what the reason.
- if (0xF000 <= *c && *c < 0xF100) // UNKNOWN PLANE
- *c = '\x20';
- }
- template <EPureType PURE>
- static inline size_t DoNumber(const unsigned char* const s, size_t len, wchar32* c) {
- Y_ASSERT(s[0] == '#');
- size_t clen = 0;
- if (s[1] == 'x' || s[1] == 'X')
- clen = DetectXNumber((const char*)s, len, c);
- else
- clen = DetectNumber((const char*)s, len, c);
- if (clen != 0) {
- if (!PureCondition<PURE>((const char*)s + clen, len - clen)) {
- return 0;
- }
- FixBadNumber(c);
- return clen + TEST_CHAR_AT(';' ==, clen);
- }
- return 0;
- }
- static inline size_t DoSymbol(ECharset cp, const unsigned char* const s, size_t len, wchar32* c) {
- size_t written = 0;
- size_t clen = 0;
- RECODE_RESULT res = RecodeToUnicode(cp, (const char*)s, c, len, 1, clen, written);
- bool error = !(res == RECODE_OK || res == RECODE_EOOUTPUT);
- if (error || clen == 0)
- clen = 1;
- if (error || written == 0)
- *c = BROKEN_RUNE;
- return clen;
- }
- ///////////////////////////////////////////////////////////////////////////////
- template <EPureType PURE>
- inline bool HtTryDecodeEntityT(const unsigned char* const s, size_t len, TEntity* entity) {
- Y_ASSERT(len != 0);
- Y_ASSERT(s[0] == '&');
- if (len > 2) {
- if (isalpha(s[1])) {
- return DetectEntity<PURE>(s, len, entity);
- }
- if (s[1] == '#') {
- entity->Codepoint2 = 0;
- entity->Len = DoNumber<PURE>(s + 1, len - 1, &(entity->Codepoint1));
- if (entity->Len != 0) {
- entity->Len += 1; // Add '&'
- Y_ASSERT(entity->Len <= len);
- return true;
- }
- }
- }
- return false;
- }
- template <EPureType PURE>
- inline bool HtTryDecodeEntityT(const TStringBuf& str, TEntity* entity) {
- return HtTryDecodeEntityT<PURE>((const unsigned char*)str.data(), str.length(), entity);
- }
- bool HtTryDecodeEntity(const char* str, size_t len, TEntity* entity) {
- return HtTryDecodeEntityT<PT_HTML5>((const unsigned char*)str, len, entity);
- }
- ///////////////////////////////////////////////////////////////////////////////
- // the string is in ASCII-compatible encoding, so entities are found as-is
- TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc) {
- const char* const dstbeg = dst;
- const char* const dstend = dstbeg + dstlen;
- TStringBuf out;
- TStringBuf str(src);
- for (size_t curpos = 0, nwr = 0;;) {
- const size_t nxtpos = str.find('&', curpos);
- const TStringBuf tail = str.SubStr(nxtpos);
- if (tail.empty()) {
- if (dstbeg == dst) { // we haven't written anything
- out = src;
- break;
- }
- if (dst + str.length() <= dstend) { // sufficient space
- memmove(dst, str.data(), str.length());
- out = TStringBuf(dstbeg, dst - dstbeg + str.length());
- }
- break;
- }
- if (dst + nxtpos >= dstend) // insufficient space
- break;
- TEntity entity;
- if (!HtTryDecodeEntityT<PT_HTML5>(tail, &entity)) {
- ++curpos;
- continue;
- }
- memmove(dst, str.data(), nxtpos);
- dst += nxtpos;
- if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint1, dst, dstend - dst, nwr))
- break;
- dst += nwr;
- if (entity.Codepoint2 != 0) {
- if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint2, dst, dstend - dst, nwr))
- break;
- dst += nwr;
- }
- str = tail.SubStr(entity.Len);
- curpos = 0;
- }
- return out;
- }
- // the string is in ASCII-compatible encoding, so entities are found as-is
- // however, the target encoding is potentially different
- TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc, ECharset cpdst) {
- if (cpsrc == cpdst)
- return HtTryEntDecodeAsciiCompat(src, dst, dstlen, cpsrc);
- const char* const dstbeg = dst;
- const char* const dstend = dstbeg + dstlen;
- TStringBuf out;
- TStringBuf str(src);
- for (size_t curpos = 0, nrd, nwr;;) {
- const size_t nxtpos = str.find('&', curpos);
- const TStringBuf tail = str.SubStr(nxtpos);
- if (tail.empty()) {
- if (RECODE_OK == Recode(cpsrc, cpdst, str.data(), dst, str.length(), dstend - dst, nrd, nwr))
- out = TStringBuf(dstbeg, dst - dstbeg + nwr);
- break;
- }
- TEntity entity;
- if (!HtTryDecodeEntityT<PT_HTML5>(tail, &entity)) {
- ++curpos;
- continue;
- }
- if (RECODE_OK != Recode(cpsrc, cpdst, str.data(), dst, nxtpos, dstend - dst, nrd, nwr))
- break;
- dst += nwr;
- if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint1, dst, dstend - dst, nwr))
- break;
- dst += nwr;
- if (entity.Codepoint2 != 0) {
- if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint2, dst, dstend - dst, nwr))
- break;
- dst += nwr;
- }
- str = tail.SubStr(entity.Len);
- curpos = 0;
- }
- return out;
- }
- ///////////////////////////////////////////////////////////////////////////////
- template <EPureType PURE>
- inline static std::pair<wchar32, wchar32> HtEntDecodeStepT(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map, bool old = false) {
- if (len == 0)
- return std::make_pair(0, 0);
- TEntity entity = {0, 0, 0};
- if (s[0] == '&') {
- if (!HtTryDecodeEntityT<PURE>(s, len, &entity) || (entity.Codepoint2 != 0 && old)) {
- entity.Len = 1;
- entity.Codepoint1 = '&';
- }
- } else {
- entity.Len = DoSymbol(cp, s, len, &(entity.Codepoint1));
- }
- Y_ASSERT(entity.Len <= len);
- s += entity.Len;
- if (map && *map)
- *(*map)++ = (unsigned char)entity.Len;
- return std::make_pair(entity.Codepoint1, entity.Codepoint2);
- }
- std::pair<wchar32, wchar32> HtEntDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
- return HtEntDecodeStepT<PT_HTML5>(cp, str, len, map);
- }
- std::pair<wchar32, wchar32> HtEntPureDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
- return HtEntDecodeStepT<PT_SEMIC>(cp, str, len, map);
- }
- wchar32 HtEntOldDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
- return HtEntDecodeStepT<PT_HTML5>(cp, str, len, map, true).first;
- }
- wchar32 HtEntOldPureDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
- return HtEntDecodeStepT<PT_SEMIC>(cp, str, len, map, true).first;
- }
- ///////////////////////////////////////////////////////////////////////////////
- size_t HtEntDecode(ECharset cp, const char* str, size_t len, wchar32* buf, size_t buflen, unsigned char* map) {
- const unsigned char* s = (const unsigned char*)str;
- const unsigned char* end = (const unsigned char*)(str + len);
- size_t ret = 0;
- while (s < end & ret < buflen) {
- const auto codepoints = HtEntDecodeStep(cp, s, end - s, &map);
- *buf++ = codepoints.first;
- ret++;
- if (codepoints.second != 0 && ret < buflen) {
- *buf++ = codepoints.second;
- ret++;
- }
- }
- return ret;
- }
- static const THashSet<ECharset> nonCompliant = {
- CODES_UNKNOWNPLANE,
- CODES_CP864,
- CODES_ISO646_CN,
- CODES_ISO646_JP,
- CODES_JISX0201,
- CODES_TCVN,
- CODES_TDS565,
- CODES_VISCII};
- static bool IsAsciiCompliant(ECharset dc) {
- return nonCompliant.count(dc) == 0 && (SingleByteCodepage(dc) || dc == CODES_UTF8);
- }
- const ui32 LOW_CHAR_COUNT = 0x80;
- class TNotRecoded {
- public:
- bool Flags[LOW_CHAR_COUNT << 1];
- bool AsciiCharsets[CODES_MAX];
- public:
- TNotRecoded() {
- memset(&Flags[0], true, LOW_CHAR_COUNT * sizeof(bool));
- memset(&Flags[LOW_CHAR_COUNT], false, LOW_CHAR_COUNT * sizeof(bool));
- Flags[(ui8)'&'] = false;
- Flags[0x7E] = false;
- Flags[0x5C] = false;
- for (ui32 c = 0; c < CODES_MAX; c++) {
- AsciiCharsets[c] = IsAsciiCompliant((ECharset)c);
- }
- }
- bool NotRecoded(unsigned char c) const noexcept {
- return Flags[static_cast<ui8>(c)];
- }
- bool AsciiComliant(ECharset c) const noexcept {
- return (static_cast<int>(c) >= 0) ? AsciiCharsets[c] : false;
- }
- };
- const TNotRecoded NotRecoded;
- template <EPureType PURE>
- static size_t HtEntDecodeToUtf8T(ECharset cp,
- const char* src, size_t srclen,
- char* dst, size_t dstlen) {
- const unsigned char* srcptr = reinterpret_cast<const unsigned char*>(src);
- unsigned char* dstptr = reinterpret_cast<unsigned char*>(dst);
- const unsigned char* const dstbeg = dstptr;
- const unsigned char* const srcend = srcptr + srclen;
- const unsigned char* const dstend = dstbeg + dstlen;
- bool asciiCompl = NotRecoded.AsciiComliant(cp);
- for (size_t len = 0; srcptr < srcend;) {
- if (asciiCompl && NotRecoded.NotRecoded(*srcptr)) {
- if (Y_UNLIKELY(dstptr >= dstend)) {
- return 0;
- }
- *dstptr++ = *srcptr++;
- continue;
- }
- const auto runes = HtEntDecodeStepT<PURE>(cp, srcptr, srcend - srcptr, nullptr);
- if (RECODE_OK != SafeWriteUTF8Char(runes.first, len, dstptr, dstend))
- return 0;
- dstptr += len;
- if (runes.second != 0) {
- if (RECODE_OK != SafeWriteUTF8Char(runes.second, len, dstptr, dstend))
- return 0;
- dstptr += len;
- }
- }
- return dstptr - dstbeg;
- }
- size_t HtEntDecodeToUtf8(ECharset cp,
- const char* src, size_t srclen,
- char* dst, size_t dstlen) {
- return HtEntDecodeToUtf8T<PT_HTML5>(cp, src, srclen, dst, dstlen);
- }
- size_t HtDecodeAttrToUtf8(ECharset cp,
- const char* src, size_t srclen,
- char* dst, size_t dstlen) {
- return HtEntDecodeToUtf8T<PT_HTML5_ATTR>(cp, src, srclen, dst, dstlen);
- }
- size_t HtEntDecodeToChar(ECharset cp, const char* str, size_t len, wchar16* dst, unsigned char* m) {
- const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
- const unsigned char* end = reinterpret_cast<const unsigned char*>(str + len);
- wchar16* startDst = dst;
- bool asciiCompl = NotRecoded.AsciiComliant(cp);
- while (s < end) {
- if (asciiCompl && NotRecoded.NotRecoded(*s)) {
- *dst++ = *s++;
- continue;
- }
- const auto codepoints = HtEntDecodeStep(cp, s, end - s, &m);
- const size_t len2 = WriteSymbol(codepoints.first, dst);
- if (codepoints.second != 0)
- WriteSymbol(codepoints.second, dst);
- if (m != nullptr && len2 > 1)
- *(m++) = 0;
- }
- return dst - startDst;
- }
- bool HtLinkDecode(const char* in, char* out, size_t buflen, size_t& written, ECharset cp) {
- return HtLinkDecode(TStringBuf(in, strlen(in)), out, buflen, written, cp);
- }
- bool HtLinkDecode(const TStringBuf& in, char* out, size_t buflen, size_t& written, ECharset cp) {
- static const char XDIGIT[] = "0123456789ABCDEFabcdef";
- written = 0;
- size_t elen = 0;
- const char* inpEnd = in.data() + in.size();
- bool asciiCompl = NotRecoded.AsciiComliant(cp);
- for (const char* p = in.data(); p < inpEnd && *p; p += elen) {
- bool isEntity = false;
- wchar32 charval = (unsigned char)*p;
- elen = 1;
- if (*p == '&') {
- TEntity entity;
- if (HtTryDecodeEntityT<PT_SEMIC>((const unsigned char*)p, inpEnd - p, &entity) && entity.Codepoint2 == 0) {
- elen = entity.Len;
- charval = entity.Codepoint1;
- isEntity = true;
- } else {
- charval = '&';
- elen = 1;
- }
- }
- if (cp != CODES_UNKNOWN && !isEntity) {
- if (asciiCompl && NotRecoded.NotRecoded(*p)) {
- charval = *p;
- } else {
- DoSymbol(cp, reinterpret_cast<const unsigned char*>(p), 6, &charval);
- if (charval == BROKEN_RUNE)
- return false;
- }
- isEntity = true;
- }
- if (charval <= 0x20 || charval >= 0x7F) {
- if (isEntity && charval >= 0x7F) {
- const size_t BUFLEN = 4; // 4 max length of UTF8 encoded character
- unsigned char buf[BUFLEN];
- size_t len = 0;
- if (SafeWriteUTF8Char(charval, len, buf, buf + BUFLEN) != RECODE_OK) // actually always OK
- return false;
- const size_t n = len * 3;
- if (written + n < buflen) {
- for (size_t i = 0; i < len; ++i) {
- out[written++] = '%';
- out[written++] = XDIGIT[buf[i] >> 4];
- out[written++] = XDIGIT[buf[i] & 15];
- }
- } else
- return false; // ERROR_SMALL_BUFFER
- } else {
- if (written + 3 > buflen)
- return false; // ERROR_SMALL_BUFFER
- unsigned char ch = *p;
- if (isEntity) {
- ch = charval;
- }
- out[written++] = '%';
- out[written++] = XDIGIT[ch >> 4];
- out[written++] = XDIGIT[ch & 15];
- }
- } else {
- if (written + 1 < buflen) {
- out[written++] = (unsigned char)charval;
- } else {
- return false; // ERROR_SMALL_BUFFER
- }
- }
- }
- return true;
- }
|