encode.cpp 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. #include "encode.h"
  2. #include <util/generic/singleton.h>
  3. namespace NUri {
  4. namespace NEncode {
  // Character sets taken from RFC 3986.
  // Reserved characters (http://tools.ietf.org/html/rfc3986#section-2.2):
  // the general delimiters and the sub-delimiters.
  #define GENDELIMS0 ":/?#[]@"
  #define SUBDELIMS0 "!$&'()*+,;="
  // Unreserved punctuation (http://tools.ietf.org/html/rfc3986#section-2.3).
  #define UNRESERVED "-._~"

  // Subsets of the reserved characters which can sometimes be decoded.
  // '#' is left out because it can never be decoded, and nothing is marked
  // as allowed for the password field (it is always completely encoded).

  // gen-delims that are safe in path, query, fragment and hashbang
  #define GENDELIMS1 ":@"
  // gen-delims that are allowed in query, fragment and hashbang
  #define GENDELIMS2 "/?"

  // sub-delims that are unsafe in a query
  #define SUBDELIMS1 "&+=;"
  // the remaining sub-delims, allowed in query, fragment and hashbang
  #define SUBDELIMS2 "!$'()*,"
  21. const TEncoder::TGrammar& TEncoder::Grammar() {
  22. return *Singleton<TEncoder::TGrammar>();
  23. }
  24. // initialize the grammar map
  25. TEncoder::TGrammar::TGrammar() {
  26. // first set up unreserved characters safe in any field
  27. const ui64 featUnres = TFeature::FeatureDecodeUnreserved;
  28. AddRng('0', '9', ECFDigit, featUnres);
  29. AddRng('A', 'Z', ECFUpper, featUnres | TFeature::FeatureToLower);
  30. AddRng('a', 'z', ECFLower, featUnres);
  31. Add(UNRESERVED, ECFUnres, featUnres);
  32. // XXX: standard "safe" set used previously "-_.!~*();/:@$,", with comment:
  33. // alnum + reserved + mark + ( '[', ']') - ('=' '+' '&' '\'' '"' '\\' '?')
  34. Add("!*();/:@$,", ECFStdrd, TFeature::FeatureDecodeStandardExtra);
  35. // now field-specific subsets of reserved characters (gen-delims + sub-delims)
  36. const ui64 featSafe = TFeature::FeatureDecodeFieldAllowed;
  37. Add(GENDELIMS1, 0, featSafe, TField::FlagPath | TField::FlagQuery | TField::FlagFrag | TField::FlagHashBang);
  38. Add(GENDELIMS2, 0, featSafe, TField::FlagQuery | TField::FlagFrag | TField::FlagHashBang);
  39. Add(SUBDELIMS1, 0, featSafe, TField::FlagUser);
  40. Add(SUBDELIMS2, 0, featSafe, TField::FlagUser | TField::FlagQuery | TField::FlagFrag | TField::FlagHashBang);
  41. // control chars
  42. AddRng(0x00, 0x20, TFeature::FeatureEncodeCntrl);
  43. Add(0x7f, TFeature::FeatureEncodeCntrl);
  44. // '%' starts a percent-encoded sequence
  45. Add('%', TFeature::FeatureDecodeANY | TFeature::FeatureEncodePercent);
  46. // extended ASCII
  47. AddRng(128, 255, TFeature::FeatureEncodeExtendedASCII | TFeature::FeatureDecodeExtendedASCII);
  48. // extended delims
  49. Add("\"<>[\\]^`{|}", TFeature::FeatureEncodeExtendedDelim | TFeature::FeatureDecodeExtendedDelim);
  50. // add characters with other features
  51. Add(' ', TFeature::FeatureEncodeSpace | TFeature::FeatureEncodeSpaceAsPlus);
  52. Add("'\"\\", TFeature::FeatureEncodeForSQL);
  53. GetMutable(':').EncodeFld |= TField::FlagUser | TField::FlagHashBang;
  54. GetMutable('?').EncodeFld |= TField::FlagPath | TField::FlagHashBang;
  55. GetMutable('#').EncodeFld |= TField::FlagPath | TField::FlagQuery | TField::FlagHashBang;
  56. GetMutable('&').EncodeFld |= TField::FlagQuery | TField::FlagHashBang;
  57. GetMutable('+').EncodeFld |= TField::FlagQuery | TField::FlagHashBang;
  58. }
  59. // should we decode an encoded character
  60. bool TCharFlags::IsDecode(ui32 fldmask, ui64 flags) const {
  61. const ui64 myflags = flags & FeatFlags;
  62. if (myflags & TFeature::FeaturesEncode)
  63. return false;
  64. if (myflags & TFeature::FeaturesDecode)
  65. return true;
  66. return (fldmask & DecodeFld) && (flags & TFeature::FeatureDecodeFieldAllowed);
  67. }
  // Offset added to a character to apply the to-lower feature
  // ('a' - 'A').
  constexpr int dD = 'a' - 'A';
  69. int TEncodeMapper::EncodeSym(unsigned char& ch) const {
  70. const TCharFlags& chflags = TEncoder::GetFlags(ch);
  71. const ui64 flags = Flags & chflags.FeatFlags;
  72. if (flags & TFeature::FeatureToLower)
  73. ch += dD;
  74. if (Q_DecodeAny)
  75. return -1;
  76. if (flags & TFeature::FeaturesEncode)
  77. return 1;
  78. if (' ' == ch) {
  79. if (Q_EncodeSpcAsPlus)
  80. ch = '+';
  81. return 0;
  82. }
  83. return 0;
  84. }
  85. int TEncodeMapper::EncodeHex(unsigned char& ch) const {
  86. const TCharFlags& chflags = TEncoder::GetFlags(ch);
  87. const ui64 flags = Flags & chflags.FeatFlags;
  88. if (flags & TFeature::FeatureToLower)
  89. ch += dD;
  90. if (Q_DecodeAny)
  91. return -1;
  92. if (chflags.IsDecode(FldMask, Flags))
  93. return 0;
  94. if (' ' == ch) {
  95. if (!Q_EncodeSpcAsPlus)
  96. return 1;
  97. ch = '+';
  98. return 0;
  99. }
  100. return 1;
  101. }
  102. bool TEncodeToMapper::Encode(unsigned char ch) const {
  103. if (Q_DecodeAny)
  104. return false;
  105. const TCharFlags& chflags = TEncoder::GetFlags(ch);
  106. if (FldMask & chflags.EncodeFld)
  107. return true;
  108. const ui64 flags = Flags & chflags.FeatFlags;
  109. return (flags & TFeature::FeaturesEncode);
  110. }
  111. TEncoder::TEncoder(IOutputStream& out, const TEncodeMapper& fldsrc, const TEncodeToMapper& flddst)
  112. : Out(out)
  113. , FldSrc(fldsrc)
  114. , FldDst(flddst)
  115. , OutFlags(0)
  116. , HexValue(0)
  117. {
  118. }
  119. IOutputStream& TEncoder::Hex(IOutputStream& out, unsigned char val) {
  120. static const char sHexCodes[] = "0123456789ABCDEF";
  121. return out << sHexCodes[(val >> 4) & 0xF] << sHexCodes[val & 0xF];
  122. }
  123. IOutputStream& TEncoder::EncodeAll(IOutputStream& out, const TStringBuf& val) {
  124. for (size_t i = 0; i != val.length(); ++i)
  125. Encode(out, val[i]);
  126. return out;
  127. }
  128. IOutputStream& TEncoder::EncodeNotAlnum(IOutputStream& out, const TStringBuf& val) {
  129. for (size_t i = 0; i != val.length(); ++i) {
  130. const char c = val[i];
  131. if (IsAlnum(c))
  132. out << c;
  133. else
  134. Encode(out, c);
  135. }
  136. return out;
  137. }
  138. IOutputStream& TEncoder::EncodeField(
  139. IOutputStream& out, const TStringBuf& val, TField::EField fld) {
  140. const ui32 fldmask = ui32(1) << fld;
  141. for (size_t i = 0; i != val.length(); ++i) {
  142. const char ch = val[i];
  143. if (GetFlags(ch).IsAllowed(fldmask))
  144. out << ch;
  145. else
  146. Encode(out, ch);
  147. }
  148. return out;
  149. }
  150. IOutputStream& TEncoder::EncodeField(
  151. IOutputStream& out, const TStringBuf& val, TField::EField fld, ui64 flags) {
  152. const ui32 fldmask = ui32(1) << fld;
  153. for (size_t i = 0; i != val.length(); ++i) {
  154. const char ch = val[i];
  155. if (GetFlags(ch).IsDecode(fldmask, flags))
  156. out << ch;
  157. else
  158. Encode(out, ch);
  159. }
  160. return out;
  161. }
  162. void TEncoder::Do(unsigned char ch, int res) {
  163. OutFlags |= GetFlags(ch).FeatFlags;
  164. bool escapepct = false;
  165. if (0 < res) // definitely encode
  166. escapepct = FldDst.Enabled() && !FldDst.Is(TField::FieldHashBang);
  167. else if (0 != res || !FldDst.Enabled() || !FldDst.Encode(ch)) {
  168. Out << ch;
  169. return;
  170. }
  171. Out << '%';
  172. if (escapepct) {
  173. Out.Write("25", 2); // '%'
  174. }
  175. Hex(Out, ch);
  176. }
  177. }
  178. }