#include "htmlentity.h"
#include
#include
#include
#include
#include
#include
#include
#include
// ASCII-only character classification. Within this file these names expand to
// the plain range checks below. Each macro evaluates its argument more than once,
// so arguments must be free of side effects.
#define isalpha(c) ('a' <= (c) && (c) <= 'z' || 'A' <= (c) && (c) <= 'Z')
#define isdigit(c) ('0' <= (c) && (c) <= '9')
#define isalnum(c) (isalpha(c) || isdigit(c))
// Bounds-checked test of s[i]: true only when i < len and `condition` holds for s[i].
// `condition` is pasted textually in front of (s[i]), so it can be a predicate
// name (isalnum) or a partial expression such as `';' ==`.
// The expansion refers to a variable named `s` at the point of use.
#define TEST_CHAR_AT_IMPL(condition, i, len) ((i < (len)) && (condition(s[i])))
// Same as TEST_CHAR_AT_IMPL, taking the length from a variable named `len` in scope.
#define TEST_CHAR_AT(condition, i) TEST_CHAR_AT_IMPL(condition, i, len)
// Largest code point accepted from a numeric character reference; larger values
// are replaced with BROKEN_RUNE in FixBadNumber.
static const ui32 UNICODE_BORDER = 0x10FFFF;
// Selects how strictly the text right after a matched reference is validated
// (see PureCondition).
enum EPureType {
PT_SEMIC, // The semicolon must always be present
PT_HTML5, // Any matched reference is accepted
PT_HTML5_ATTR // Accepted with ';', or when not followed by '=' or an alphanumeric
};
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference (anything else comments)
// Decides whether a matched character reference may be accepted, based on the
// characters at afterEntityStr[0] and afterEntityStr[1] (both bounds-checked
// against len).
//  - PT_HTML5: always true.
//  - PT_SEMIC: true only when afterEntityStr[0] is ';'.
//  - otherwise (PT_HTML5_ATTR): true when afterEntityStr[0] is ';', or when
//    afterEntityStr[1] is missing or is neither '=' nor an ASCII alphanumeric.
template
static inline bool PureCondition(const char* afterEntityStr, size_t len) {
if (PURE == PT_HTML5)
return true;
// TEST_CHAR_AT reads the local names `s` and `len`.
const char* s = afterEntityStr;
if (PURE == PT_SEMIC) {
return TEST_CHAR_AT(';' ==, 0);
} else {
return TEST_CHAR_AT(';' ==, 0) || !(TEST_CHAR_AT('=' ==, 1) || TEST_CHAR_AT(isalnum, 1));
}
}
// Tries to decode a named entity from text that begins with '&'.
//   str, len - input; str[0] is asserted to be '&'.
//   entity   - filled by DecodeNamedEntity on a match.
// Returns true when DecodeNamedEntity matches and PureCondition accepts the
// match; entity->Len is then incremented by one to account for the leading '&'.
// Returns false for empty input, no match, or a rejected match.
template
inline static bool DetectEntity(const unsigned char* const str, size_t len, TEntity* entity) {
if (len == 0)
return 0;
Y_ASSERT(str[0] == '&');
if (DecodeNamedEntity(str + 1, len - 1, entity)) { // exclude '&'
// entity->Len is still relative to str + 1 here, so str + entity->Len is one
// position before the first character following the match.
if (PureCondition((const char*)str + entity->Len, len - entity->Len)) {
entity->Len += 1; // add '&'
Y_ASSERT(entity->Len <= len);
return true;
}
}
return false;
}
static size_t DetectNumber(const char* inputStr, size_t len, wchar32* codepoint) {
if (len < 2)
return 0;
Y_ASSERT(inputStr[0] == '#');
static TCompactStrSpn DIGITS("0123456789");
const char* digitEnd = DIGITS.FindFirstNotOf(inputStr + 1, inputStr + len);
if (digitEnd == inputStr + 1)
return 0;
*codepoint = inputStr[1] - '0';
for (auto sym = inputStr + 2; sym != digitEnd; ++sym) {
if (*codepoint < UNICODE_BORDER)
*codepoint = *codepoint * 10 + (*sym - '0');
}
return digitEnd - inputStr;
}
static size_t DetectXNumber(const char* inputStr, size_t len, wchar32* codepoint) {
if (len < 3)
return 0;
Y_ASSERT(inputStr[0] == '#');
Y_ASSERT(inputStr[1] == 'x' || inputStr[1] == 'X');
static TCompactStrSpn XDIGITS("0123456789ABCDEFabcdef");
const char* digitEnd = XDIGITS.FindFirstNotOf(inputStr + 2, inputStr + len);
if (digitEnd == inputStr + 2)
return 0;
*codepoint = Char2Digit(inputStr[2]);
for (const char* sym = inputStr + 3; sym != digitEnd; ++sym) {
if (*codepoint < UNICODE_BORDER)
*codepoint = *codepoint * 16 + Char2Digit(*sym);
}
return digitEnd - inputStr;
}
///////////////////////////////////////////////////////////////////////////////
// Normalizes, in place, a code point obtained from a numeric character reference:
//  - zero, surrogates (0xD800..0xDFFF) and values above UNICODE_BORDER become BROKEN_RUNE;
//  - values 128..159 are looked up in the CODES_ASCII code page table;
//  - values 0xF000..0xF0FF become a space.
// The rules are applied in this order, each one to the result of the previous.
static inline void FixBadNumber(wchar32* c) {
    wchar32 rune = *c;

    const bool isSurrogate = (0xD800 <= rune && rune <= 0xDFFF);
    if (rune == 0 || isSurrogate || rune > UNICODE_BORDER) {
        rune = BROKEN_RUNE;
    }

    if (128 <= rune && rune < 160) {
        rune = CodePageByCharset(CODES_ASCII)->unicode[rune];
    }

    // The reason for this mapping is not documented in the code (UNKNOWN PLANE).
    if (0xF000 <= rune && rune < 0xF100) {
        rune = '\x20';
    }

    *c = rune;
}
// Decodes a numeric character reference whose text starts at s[0] == '#'.
//   s, len - input beginning at '#'.
//   c      - receives the decoded (and FixBadNumber-normalized) code point.
// Returns the number of characters consumed starting from '#', including a
// trailing ';' when one is present, or 0 when there is no valid number or
// PureCondition rejects it.
template
static inline size_t DoNumber(const unsigned char* const s, size_t len, wchar32* c) {
Y_ASSERT(s[0] == '#');
size_t clen = 0;
// s[1] is read without a length check here; the caller visible in this file
// only reaches this point with len >= 2.
if (s[1] == 'x' || s[1] == 'X')
clen = DetectXNumber((const char*)s, len, c);
else
clen = DetectNumber((const char*)s, len, c);
if (clen != 0) {
if (!PureCondition((const char*)s + clen, len - clen)) {
return 0;
}
FixBadNumber(c);
// The comparison yields 0 or 1, so a ';' right after the digits is consumed too.
return clen + TEST_CHAR_AT(';' ==, clen);
}
return 0;
}
// Decodes one character of charset cp from s into *c.
// Returns the number of input bytes consumed, which is always at least 1.
// *c is set to BROKEN_RUNE when recoding fails or produces no output.
static inline size_t DoSymbol(ECharset cp, const unsigned char* const s, size_t len, wchar32* c) {
    size_t produced = 0;
    size_t consumed = 0;
    const RECODE_RESULT status = RecodeToUnicode(cp, (const char*)s, c, len, 1, consumed, produced);

    // Running out of output space is expected: only one code point is requested.
    const bool failed = (status != RECODE_OK && status != RECODE_EOOUTPUT);
    if (failed || produced == 0) {
        *c = BROKEN_RUNE;
    }
    // Always advance by at least one byte so callers make progress.
    return (failed || consumed == 0) ? 1 : consumed;
}
///////////////////////////////////////////////////////////////////////////////
// Tries to decode a character reference at s, which must start with '&'.
// Input of length 2 or less is never decoded. A letter after '&' is dispatched
// to DetectEntity (named reference); '#' is dispatched to DoNumber (numeric reference).
// On success returns true with entity->Len set to the total consumed length,
// including '&'. For numeric references Codepoint2 is set to 0.
template
inline bool HtTryDecodeEntityT(const unsigned char* const s, size_t len, TEntity* entity) {
Y_ASSERT(len != 0);
Y_ASSERT(s[0] == '&');
if (len > 2) {
if (isalpha(s[1])) {
return DetectEntity(s, len, entity);
}
if (s[1] == '#') {
entity->Codepoint2 = 0;
entity->Len = DoNumber(s + 1, len - 1, &(entity->Codepoint1));
if (entity->Len != 0) {
entity->Len += 1; // Add '&'
Y_ASSERT(entity->Len <= len);
return true;
}
}
}
return false;
}
// TStringBuf convenience overload: forwards str.data() and str.length() to the
// pointer/length form.
template
inline bool HtTryDecodeEntityT(const TStringBuf& str, TEntity* entity) {
return HtTryDecodeEntityT((const unsigned char*)str.data(), str.length(), entity);
}
// Non-template entry point: tries to decode the character reference at str
// (which must start with '&') by forwarding to HtTryDecodeEntityT.
// Returns true and fills *entity on success.
bool HtTryDecodeEntity(const char* str, size_t len, TEntity* entity) {
return HtTryDecodeEntityT((const unsigned char*)str, len, entity);
}
///////////////////////////////////////////////////////////////////////////////
// the string is in ASCII-compatible encoding, so entities are found as-is
//
// Decodes character references in src and writes the result into
// dst[0..dstlen), keeping the text in charset cpsrc.
// Returns:
//  - src itself when nothing has been written to dst (no reference was decoded);
//  - a view over dst holding the decoded text on success;
//  - an empty TStringBuf when dst is too small or a code point cannot be
//    represented in cpsrc.
TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc) {
    const char* const dstbeg = dst;
    const char* const dstend = dstbeg + dstlen;
    TStringBuf out;
    TStringBuf str(src); // not yet processed part of the input
    for (size_t curpos = 0, nwr = 0;;) {
        const size_t nxtpos = str.find('&', curpos);
        const TStringBuf tail = str.SubStr(nxtpos);
        if (tail.empty()) {
            // No more '&' in the remaining input.
            if (dstbeg == dst) { // we haven't written anything
                out = src;
                break;
            }
            if (dst + str.length() <= dstend) { // sufficient space
                memmove(dst, str.data(), str.length());
                out = TStringBuf(dstbeg, dst - dstbeg + str.length());
            }
            break;
        }
        if (dst + nxtpos >= dstend) // insufficient space
            break;
        TEntity entity;
        if (!HtTryDecodeEntityT(tail, &entity)) {
            // Not a reference: continue searching right after this '&'.
            // Advancing by a single position would find and re-test the same
            // '&' once per skipped byte.
            curpos = nxtpos + 1;
            continue;
        }
        // Copy the literal text preceding the reference, then the decoded code point(s).
        memmove(dst, str.data(), nxtpos);
        dst += nxtpos;
        if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint1, dst, dstend - dst, nwr))
            break;
        dst += nwr;
        if (entity.Codepoint2 != 0) {
            if (RECODE_OK != RecodeFromUnicode(cpsrc, entity.Codepoint2, dst, dstend - dst, nwr))
                break;
            dst += nwr;
        }
        str = tail.SubStr(entity.Len);
        curpos = 0;
    }
    return out;
}
// the string is in ASCII-compatible encoding, so entities are found as-is
// however, the target encoding is potentially different
//
// Decodes character references in src (charset cpsrc) and writes the result
// into dst[0..dstlen) in charset cpdst. When both charsets are equal the
// single-charset overload is used.
// Returns a view over dst holding the converted text, or an empty TStringBuf
// when any recoding step fails (including lack of space in dst).
TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc, ECharset cpdst) {
    if (cpsrc == cpdst)
        return HtTryEntDecodeAsciiCompat(src, dst, dstlen, cpsrc);
    const char* const dstbeg = dst;
    const char* const dstend = dstbeg + dstlen;
    TStringBuf out;
    TStringBuf str(src); // not yet processed part of the input
    for (size_t curpos = 0, nrd, nwr;;) {
        const size_t nxtpos = str.find('&', curpos);
        const TStringBuf tail = str.SubStr(nxtpos);
        if (tail.empty()) {
            // No more '&': convert whatever is left and finish.
            if (RECODE_OK == Recode(cpsrc, cpdst, str.data(), dst, str.length(), dstend - dst, nrd, nwr))
                out = TStringBuf(dstbeg, dst - dstbeg + nwr);
            break;
        }
        TEntity entity;
        if (!HtTryDecodeEntityT(tail, &entity)) {
            // Not a reference: continue searching right after this '&'.
            // Advancing by a single position would find and re-test the same
            // '&' once per skipped byte.
            curpos = nxtpos + 1;
            continue;
        }
        // Convert the literal text preceding the reference.
        if (RECODE_OK != Recode(cpsrc, cpdst, str.data(), dst, nxtpos, dstend - dst, nrd, nwr))
            break;
        dst += nwr;
        // The output buffer is in cpdst, so decoded code points must be
        // encoded in cpdst as well (not in the source charset).
        if (RECODE_OK != RecodeFromUnicode(cpdst, entity.Codepoint1, dst, dstend - dst, nwr))
            break;
        dst += nwr;
        if (entity.Codepoint2 != 0) {
            if (RECODE_OK != RecodeFromUnicode(cpdst, entity.Codepoint2, dst, dstend - dst, nwr))
                break;
            dst += nwr;
        }
        str = tail.SubStr(entity.Len);
        curpos = 0;
    }
    return out;
}
///////////////////////////////////////////////////////////////////////////////
// Decodes one unit of input at s: either a character reference or a single
// character of charset cp.
//   s   - advanced past the consumed bytes.
//   map - when map and *map are non-null, the consumed length is stored at
//         **map (as unsigned char) and *map is advanced.
//   old - when true, references that decode to two code points are not
//         accepted and the '&' is emitted literally instead.
// Returns the decoded code point pair; the second is 0 when there is only one.
// Returns (0, 0) for empty input.
template
inline static std::pair HtEntDecodeStepT(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map, bool old = false) {
if (len == 0)
return std::make_pair(0, 0);
TEntity entity = {0, 0, 0};
if (s[0] == '&') {
// An undecodable '&' is passed through as a literal one-byte character.
if (!HtTryDecodeEntityT(s, len, &entity) || (entity.Codepoint2 != 0 && old)) {
entity.Len = 1;
entity.Codepoint1 = '&';
}
} else {
entity.Len = DoSymbol(cp, s, len, &(entity.Codepoint1));
}
Y_ASSERT(entity.Len <= len);
s += entity.Len;
if (map && *map)
*(*map)++ = (unsigned char)entity.Len;
return std::make_pair(entity.Codepoint1, entity.Codepoint2);
}
// Decodes one character or character reference at str; forwards all arguments
// to HtEntDecodeStepT and returns its code point pair.
std::pair HtEntDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
return HtEntDecodeStepT(cp, str, len, map);
}
// "Pure" variant of the single decoding step; forwards all arguments to
// HtEntDecodeStepT and returns its code point pair.
std::pair HtEntPureDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
return HtEntDecodeStepT(cp, str, len, map);
}
// Single-code-point decoding step: calls HtEntDecodeStepT with old = true (so
// two-code-point references are not decoded) and returns only the first code point.
wchar32 HtEntOldDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
return HtEntDecodeStepT(cp, str, len, map, true).first;
}
// "Pure" single-code-point decoding step: calls HtEntDecodeStepT with
// old = true and returns only the first code point.
wchar32 HtEntOldPureDecodeStep(ECharset cp, const unsigned char*& str, size_t len, unsigned char** map) {
return HtEntDecodeStepT(cp, str, len, map, true).first;
}
///////////////////////////////////////////////////////////////////////////////
// Decodes text in charset cp, resolving character references, into an array of
// code points.
//   str, len    - input text.
//   buf, buflen - output array and its capacity in code points.
//   map         - passed to HtEntDecodeStep, which records the consumed input
//                 length of each step when the pointer is non-null.
// Returns the number of code points written. Decoding stops when the input is
// exhausted or buf is full; the second code point of a two-code-point
// reference is dropped when it no longer fits.
size_t HtEntDecode(ECharset cp, const char* str, size_t len, wchar32* buf, size_t buflen, unsigned char* map) {
    const unsigned char* s = (const unsigned char*)str;
    const unsigned char* end = (const unsigned char*)(str + len);
    size_t ret = 0;
    // Logical AND: both operands are boolean conditions, not bit masks.
    while (s < end && ret < buflen) {
        const auto codepoints = HtEntDecodeStep(cp, s, end - s, &map);
        *buf++ = codepoints.first;
        ret++;
        if (codepoints.second != 0 && ret < buflen) {
            *buf++ = codepoints.second;
            ret++;
        }
    }
    return ret;
}
// Charsets that IsAsciiCompliant always rejects, regardless of its other checks.
static const THashSet nonCompliant = {
CODES_UNKNOWNPLANE,
CODES_CP864,
CODES_ISO646_CN,
CODES_ISO646_JP,
CODES_JISX0201,
CODES_TCVN,
CODES_TDS565,
CODES_VISCII};
// Tells whether charset dc is treated as ASCII-compatible: it must not be
// listed in nonCompliant and must be either a single-byte code page or UTF-8.
static bool IsAsciiCompliant(ECharset dc) {
    if (nonCompliant.count(dc) != 0) {
        return false;
    }
    return SingleByteCodepage(dc) || dc == CODES_UTF8;
}
// Number of byte values in the lower (7-bit) half of the byte range.
const ui32 LOW_CHAR_COUNT = 0x80;
// Precomputed lookup tables used by the decoders' fast path:
//  - Flags[b] tells whether byte b can be copied to the output as-is;
//  - AsciiCharsets[c] caches IsAsciiCompliant for every charset below CODES_MAX.
class TNotRecoded {
public:
bool Flags[LOW_CHAR_COUNT << 1];
bool AsciiCharsets[CODES_MAX];
public:
TNotRecoded() {
// Bytes 0x00..0x7F are copied as-is by default, bytes 0x80..0xFF never are.
memset(&Flags[0], true, LOW_CHAR_COUNT * sizeof(bool));
memset(&Flags[LOW_CHAR_COUNT], false, LOW_CHAR_COUNT * sizeof(bool));
// '&' may start a character reference; 0x7E ('~') and 0x5C ('\') are also
// excluded from the fast path and go through the regular decoding step.
Flags[(ui8)'&'] = false;
Flags[0x7E] = false;
Flags[0x5C] = false;
for (ui32 c = 0; c < CODES_MAX; c++) {
AsciiCharsets[c] = IsAsciiCompliant((ECharset)c);
}
}
// True when byte c can be copied to the output without decoding.
bool NotRecoded(unsigned char c) const noexcept {
return Flags[static_cast(c)];
}
// Cached IsAsciiCompliant result for charset c; false for negative values.
// NOTE(review): no upper-bound check against CODES_MAX is performed here.
bool AsciiComliant(ECharset c) const noexcept {
return (static_cast(c) >= 0) ? AsciiCharsets[c] : false;
}
};
// File-wide instance of the lookup tables, built during static initialization.
const TNotRecoded NotRecoded;
// Decodes text in charset cp, resolving character references, and writes the
// result as UTF-8.
//   src, srclen - input text.
//   dst, dstlen - output buffer and its size in bytes.
// Returns the number of bytes written to dst, or 0 when dst is too small or
// SafeWriteUTF8Char reports an error.
template
static size_t HtEntDecodeToUtf8T(ECharset cp,
const char* src, size_t srclen,
char* dst, size_t dstlen) {
const unsigned char* srcptr = reinterpret_cast(src);
unsigned char* dstptr = reinterpret_cast(dst);
const unsigned char* const dstbeg = dstptr;
const unsigned char* const srcend = srcptr + srclen;
const unsigned char* const dstend = dstbeg + dstlen;
bool asciiCompl = NotRecoded.AsciiComliant(cp);
for (size_t len = 0; srcptr < srcend;) {
// Fast path: for ASCII-compatible charsets, bytes flagged in NotRecoded are
// copied unchanged.
if (asciiCompl && NotRecoded.NotRecoded(*srcptr)) {
if (Y_UNLIKELY(dstptr >= dstend)) {
return 0;
}
*dstptr++ = *srcptr++;
continue;
}
// Slow path: decode one character or character reference (srcptr is advanced
// by the step) and emit one or two code points as UTF-8.
const auto runes = HtEntDecodeStepT(cp, srcptr, srcend - srcptr, nullptr);
if (RECODE_OK != SafeWriteUTF8Char(runes.first, len, dstptr, dstend))
return 0;
dstptr += len;
if (runes.second != 0) {
if (RECODE_OK != SafeWriteUTF8Char(runes.second, len, dstptr, dstend))
return 0;
dstptr += len;
}
}
return dstptr - dstbeg;
}
// Decodes text in charset cp, resolving character references, into UTF-8.
// Forwards to HtEntDecodeToUtf8T; returns the number of bytes written to dst,
// or 0 on failure.
size_t HtEntDecodeToUtf8(ECharset cp,
const char* src, size_t srclen,
char* dst, size_t dstlen) {
return HtEntDecodeToUtf8T(cp, src, srclen, dst, dstlen);
}
// Attribute-value variant of the UTF-8 decoder. Forwards to
// HtEntDecodeToUtf8T; returns the number of bytes written to dst, or 0 on failure.
size_t HtDecodeAttrToUtf8(ECharset cp,
const char* src, size_t srclen,
char* dst, size_t dstlen) {
return HtEntDecodeToUtf8T(cp, src, srclen, dst, dstlen);
}
// Decodes text in charset cp, resolving character references, into wchar16 units.
//   str, len - input text.
//   dst      - output buffer. NOTE(review): no capacity is passed, so the
//              caller presumably guarantees that dst is large enough.
//   m        - optional map updated by HtEntDecodeStep; may be nullptr.
// Returns the number of wchar16 units written.
size_t HtEntDecodeToChar(ECharset cp, const char* str, size_t len, wchar16* dst, unsigned char* m) {
const unsigned char* s = reinterpret_cast(str);
const unsigned char* end = reinterpret_cast(str + len);
wchar16* startDst = dst;
bool asciiCompl = NotRecoded.AsciiComliant(cp);
while (s < end) {
// Fast path: flagged bytes of ASCII-compatible charsets are copied as-is.
// This path does not touch the map.
if (asciiCompl && NotRecoded.NotRecoded(*s)) {
*dst++ = *s++;
continue;
}
const auto codepoints = HtEntDecodeStep(cp, s, end - s, &m);
const size_t len2 = WriteSymbol(codepoints.first, dst);
if (codepoints.second != 0)
WriteSymbol(codepoints.second, dst);
// When the first code point took more than one output unit, an extra zero
// entry is appended to the map.
if (m != nullptr && len2 > 1)
*(m++) = 0;
}
return dst - startDst;
}
bool HtLinkDecode(const char* in, char* out, size_t buflen, size_t& written, ECharset cp) {
return HtLinkDecode(TStringBuf(in, strlen(in)), out, buflen, written, cp);
}
// Decodes character references in a link and percent-encodes every resulting
// character that is <= 0x20 or >= 0x7F.
//   in      - input text; processing also stops at the first NUL byte.
//   out     - output buffer of buflen bytes; no terminating NUL is written here.
//   written - receives the number of bytes stored in out.
//   cp      - charset of the input; with CODES_UNKNOWN raw bytes are not recoded.
// Decoded characters >= 0x7F are emitted as percent-encoded UTF-8.
// Returns false when out is too small or a character cannot be decoded from cp.
bool HtLinkDecode(const TStringBuf& in, char* out, size_t buflen, size_t& written, ECharset cp) {
    static const char XDIGIT[] = "0123456789ABCDEFabcdef";
    written = 0;
    size_t elen = 0;
    const char* inpEnd = in.data() + in.size();
    bool asciiCompl = NotRecoded.AsciiComliant(cp);
    for (const char* p = in.data(); p < inpEnd && *p; p += elen) {
        bool isEntity = false;
        wchar32 charval = (unsigned char)*p;
        elen = 1;
        if (*p == '&') {
            TEntity entity;
            // References that decode to two code points are not supported here;
            // the '&' is then kept as a literal character.
            if (HtTryDecodeEntityT((const unsigned char*)p, inpEnd - p, &entity) && entity.Codepoint2 == 0) {
                elen = entity.Len;
                charval = entity.Codepoint1;
                isEntity = true;
            } else {
                charval = '&';
                elen = 1;
            }
        }
        if (cp != CODES_UNKNOWN && !isEntity) {
            if (asciiCompl && NotRecoded.NotRecoded(*p)) {
                charval = *p;
            } else {
                // Look at no more than 6 bytes, and never past the end of the
                // input, so the recoder cannot read outside the caller's buffer.
                const size_t avail = inpEnd - p;
                DoSymbol(cp, reinterpret_cast(p), avail < 6 ? avail : 6, &charval);
                if (charval == BROKEN_RUNE)
                    return false;
            }
            // From here on charval holds a decoded code point.
            isEntity = true;
        }
        if (charval <= 0x20 || charval >= 0x7F) {
            if (isEntity && charval >= 0x7F) {
                const size_t BUFLEN = 4; // 4 max length of UTF8 encoded character
                unsigned char buf[BUFLEN];
                size_t len = 0;
                if (SafeWriteUTF8Char(charval, len, buf, buf + BUFLEN) != RECODE_OK) // actually always OK
                    return false;
                // Each UTF-8 byte becomes a three-character "%XX" sequence.
                const size_t n = len * 3;
                if (written + n < buflen) {
                    for (size_t i = 0; i < len; ++i) {
                        out[written++] = '%';
                        out[written++] = XDIGIT[buf[i] >> 4];
                        out[written++] = XDIGIT[buf[i] & 15];
                    }
                } else
                    return false; // ERROR_SMALL_BUFFER
            } else {
                if (written + 3 > buflen)
                    return false; // ERROR_SMALL_BUFFER
                unsigned char ch = *p;
                if (isEntity) {
                    ch = charval;
                }
                out[written++] = '%';
                out[written++] = XDIGIT[ch >> 4];
                out[written++] = XDIGIT[ch & 15];
            }
        } else {
            // Printable ASCII is copied unchanged.
            if (written + 1 < buflen) {
                out[written++] = (unsigned char)charval;
            } else {
                return false; // ERROR_SMALL_BUFFER
            }
        }
    }
    return true;
}