htmlentity.h 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. #pragma once
  2. #include "decoder.h"
  3. #include <util/system/defaults.h>
  4. #include <library/cpp/charset/doccodes.h>
  5. #include <util/generic/strbuf.h>
  6. #include <utility>
  7. /******************************************************/
  8. /* direct decoding actions */
  9. /******************************************************/
  10. //! Tries to decode a named or numeric entity using the general HTML5 standard rules.
  11. //! @param str - string starting with '&'.
  12. bool HtTryDecodeEntity(const char* str, size_t len, TEntity* entity);
  13. /******************************************************/
  14. /* step by step actions */
  15. /******************************************************/
  16. // NOTE: Some entities have two codepoints; if an entity has only one codepoint,
  17. // then the second wchar32 in the pair is zero.
  18. // Decodes with HTML5 standard rules.
  19. std::pair<wchar32, wchar32> HtEntDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map);
  20. // Decodes assuming that ';' is always present after the entity.
  21. std::pair<wchar32, wchar32> HtEntPureDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map);
  22. // Similar to HtEntDecodeStep, but does not decode named entities with two codepoints.
  23. // Use HtEntDecodeStep and HtEntPureDecodeStep instead.
  24. wchar32 HtEntOldDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map);
  25. wchar32 HtEntOldPureDecodeStep(ECharset cp, const unsigned char*& s, size_t len, unsigned char** map);
  26. /******************************************************/
  27. /* complete actions */
  28. /******************************************************/
  29. // Tries to decode str using the general HTML5 standard rules.
  30. // Stops when either str or the buffer is exhausted.
  31. size_t HtEntDecode(ECharset cp, const char* str, size_t len, wchar32* buffer, size_t buflen, unsigned char* char_lengthes = nullptr);
  32. size_t HtEntDecodeToUtf8(ECharset cp, const char* src, size_t srclen, char* dst, size_t dstlen);
  33. // Special rules for decoding attribute values
  34. // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#character-reference-in-attribute-value-state
  35. size_t HtDecodeAttrToUtf8(ECharset cp, const char* src, size_t srclen, char* dst, size_t dstlen);
  36. size_t HtEntDecodeToChar(ECharset cp, const char* str, size_t len, wchar16* buffer, unsigned char* char_lengthes = nullptr);
  37. /**
  38. * decode HTML entities if any
  39. * @param src input buffer
  40. * @param dst output buffer
  41. * @param dstlen output buffer length
  42. * @param cpsrc input buffer encoding, ascii-compatible
  43. * @param cpdst output buffer encoding, if different from cpsrc
  44. * @return src if there are no entities and the encodings are the same (dst remains untouched);
  45. * NULL if dst was not sufficiently long;
  46. * otherwise a dst-based output buffer with the decoded string
  47. * @note entities must be pure, with the terminating ";"
  48. */
  49. TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc = CODES_UTF8);
  50. TStringBuf HtTryEntDecodeAsciiCompat(const TStringBuf& src, char* dst, size_t dstlen, ECharset cpsrc, ECharset cpdst);
  51. //! Decodes HTML entities and converts non-ASCII characters to Unicode, then converts Unicode to UTF-8 and percent-encodes.
  52. //! @param text zero-terminated text of link
  53. //! @param buffer buffer receiving UTF8 percent-encoded text of link
  54. //! @param buflen length of output buffer
  55. //! @param cp code page object used to convert non-ASCII characters
  56. //! @note HTML entities are converted directly into Unicode characters, non-ASCII characters
  57. //! are converted into Unicode using the code page object if it is passed to the function,
  58. //! then the Unicode characters are converted to UTF-8 and percent-encoded;
  59. //! percent-encoded text in the link is copied into the output buffer as is.
  60. bool HtLinkDecode(const char* text, char* buffer, size_t buflen, size_t& written, ECharset cp = CODES_UNKNOWN);
  61. bool HtLinkDecode(const TStringBuf& text, char* buffer, size_t buflen, size_t& written, ECharset cp = CODES_UNKNOWN);
  62. static inline bool HtLinkDecode(const char* text, char* buffer, size_t buflen, ECharset cp = CODES_UNKNOWN) {
  63. size_t written;
  64. const bool ok = HtLinkDecode(text, buffer, buflen, written, cp);
  65. if (ok)
  66. buffer[written] = '\x00';
  67. return ok;
  68. }