assign.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. #include "uri.h"
  2. #include "parse.h"
  3. #include <idna.h>
  4. #include <library/cpp/charset/recyr.hh>
  5. #include <util/charset/wide.h>
  6. #include <util/memory/tempbuf.h>
  7. #include <util/string/cast.h>
  8. #include <util/system/yassert.h>
  9. #include <util/system/sys_alloc.h>
  10. namespace NUri {
  11. TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
  12. // XXX: don't use punycode_encode directly as it doesn't include
  13. // proper stringprep and splitting on dot-equivalent characters
  14. char* buf;
  15. static_assert(sizeof(*idna) == sizeof(ui32), "fixme");
  16. if (IDNA_SUCCESS != idna_to_ascii_4z((const uint32_t*)idna, &buf, 0))
  17. buf = nullptr;
  18. return buf;
  19. }
  20. TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
  21. TTempBuf buf(sizeof(wchar32) * (1 + host.length()));
  22. wchar32* wbuf = reinterpret_cast<wchar32*>(buf.Data());
  23. const size_t written = NDetail::NBaseOps::Recode(host, wbuf, enc).length();
  24. wbuf[written] = 0;
  25. return IDNToAscii(wbuf);
  26. }
  27. TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) {
  28. TStringBuf outhost; // store the result here before returning it, to get RVO
  29. size_t buflen = 0;
  30. if (hasExtended && !allowIDN)
  31. return outhost; // definitely can't convert
  32. // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII
  33. // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding
  34. const bool recoding = CODES_UTF8 != enc && hasExtended;
  35. if (recoding) {
  36. size_t nrd, nwr;
  37. buflen = host.length() * 4;
  38. buf.Reset(static_cast<char*>(y_allocate(buflen)));
  39. if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), buflen, nrd, nwr))
  40. return outhost;
  41. host = TStringBuf(buf.Get(), nwr);
  42. }
  43. // percent-decode
  44. if (0 == buflen) {
  45. buflen = host.length();
  46. buf.Reset(static_cast<char*>(y_allocate(buflen)));
  47. }
  48. // decoding shortens so writing over host in buf is OK
  49. TMemoryWriteBuffer out(buf.Get(), buflen);
  50. TEncoder decoder(out, FeatureDecodeANY | FeatureToLower);
  51. const long outFlags = decoder.ReEncode(host);
  52. hasExtended = 0 != (outFlags & FeatureEncodeExtendedASCII);
  53. // check again
  54. if (hasExtended && !allowIDN)
  55. return outhost;
  56. host = out.Str();
  57. // convert to punycode if needed
  58. if (!hasExtended) {
  59. outhost = host;
  60. return outhost;
  61. }
  62. TMallocPtr<char> puny;
  63. try {
  64. puny = IDNToAscii(host);
  65. } catch (const yexception& /* exc */) {
  66. }
  67. if (!puny) {
  68. // XXX: try user charset unless UTF8 or converted to it
  69. if (CODES_UTF8 == enc || recoding)
  70. return outhost;
  71. try {
  72. puny = IDNToAscii(host, enc);
  73. } catch (const yexception& /* exc */) {
  74. return outhost;
  75. }
  76. if (!puny)
  77. return outhost;
  78. }
  79. buf = puny;
  80. outhost = buf.Get();
  81. return outhost;
  82. }
  83. TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
  84. // find what we have
  85. long haveFlags = 0;
  86. for (size_t i = 0; i != host.length(); ++i)
  87. haveFlags |= TEncoder::GetFlags(host[i]).FeatFlags;
  88. // interested in encoded characters or (if IDN is allowed) extended ascii
  89. TStringBuf outhost;
  90. const bool haveExtended = haveFlags & FeatureEncodeExtendedASCII;
  91. if (!haveExtended || allowIDN) {
  92. if (!haveExtended && 0 == (haveFlags & FeatureDecodeANY))
  93. outhost = host;
  94. else
  95. outhost = HostToAscii(host, buf, haveExtended, allowIDN, enc);
  96. }
  97. return outhost;
  98. }
  99. static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField fld, const TStringBuf& val, long flags) {
  100. if (val.empty())
  101. return false;
  102. if (flags & TFeature::FeaturesAllEncoder)
  103. TUri::ReEncodeField(out, val, fld, flags);
  104. else
  105. out << val;
  106. return true;
  107. }
  108. TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defscheme) {
  109. Clear();
  110. TState::EParsed ret = parser.State;
  111. if (ParsedBadFormat <= ret)
  112. return ret;
  113. const TSection& scheme = parser.Get(FieldScheme);
  114. const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);
  115. // set the scheme always if available
  116. if (schemeInfo.Str.empty() && scheme.IsSet())
  117. FldSet(FieldScheme, scheme.Get());
  118. if (ParsedOK != ret)
  119. return ret;
  120. size_t buflen = 0;
  121. // special processing for fields
  122. const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
  123. long flags = parser.Flags.Allow;
  124. if (convertIDN)
  125. flags |= FeatureAllowHostIDN | FeatureCheckHost;
  126. // process non-ASCII host for punycode
  127. TMallocPtr<char> hostptr;
  128. TStringBuf hostascii; // empty: use host field; non-empty: ascii
  129. bool hostConverted = false; // hostascii is empty or the original
  130. const TSection& host = parser.Get(FieldHost);
  131. if (host.IsSet() && !FldIsSet(FieldHost)) {
  132. const bool allowIDN = (flags & FeatureAllowHostIDN);
  133. const TStringBuf hostbuf = host.Get();
  134. // if we know we have and allow extended-ASCII chars, no need to check further
  135. if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII))
  136. hostascii = HostToAscii(hostbuf, hostptr, true, true, parser.Enc);
  137. else
  138. hostascii = HostToAscii(hostbuf, hostptr, allowIDN, parser.Enc);
  139. if (hostascii.empty())
  140. ret = ParsedBadHost; // exists but cannot be converted
  141. else if (hostbuf.data() != hostascii.data()) {
  142. hostConverted = true;
  143. buflen += 1 + hostascii.length();
  144. if (convertIDN)
  145. FldMarkSet(FieldHost); // so that we don't process host below
  146. }
  147. }
  148. // add unprocessed fields
  149. for (int idx = 0; idx < FieldUrlMAX; ++idx) {
  150. const EField fld = EField(idx);
  151. const TSection& section = parser.Get(fld);
  152. if (section.IsSet() && !FldIsSet(fld))
  153. buflen += 1 + section.EncodedLen(); // includes null
  154. }
  155. if (0 == buflen) // no more sections set?
  156. return ret;
  157. // process #! fragments
  158. // https://developers.google.com/webmasters/ajax-crawling/docs/specification
  159. static const TStringBuf escFragPrefix(TStringBuf("_escaped_fragment_="));
  160. bool encHashBangFrag = false;
  161. TStringBuf qryBeforeEscapedFragment;
  162. TStringBuf qryEscapedFragment;
  163. do {
  164. if (FldIsSet(FieldFrag) || FldIsSet(FieldQuery))
  165. break;
  166. const TSection& frag = parser.Get(FieldFrag);
  167. if (frag.IsSet()) {
  168. if (0 == (parser.Flags & FeatureHashBangToEscapedFragment))
  169. break;
  170. const TStringBuf fragbuf = frag.Get();
  171. if (fragbuf.empty() || '!' != fragbuf[0])
  172. break;
  173. encHashBangFrag = true;
  174. // '!' will make space for '&' or '\0' if needed
  175. buflen += escFragPrefix.length();
  176. buflen += 2 * fragbuf.length(); // we don't know how many will be encoded
  177. } else {
  178. const TSection& qry = parser.Get(FieldQuery);
  179. if (!qry.IsSet())
  180. break;
  181. // FeatureHashBangToEscapedFragment has preference
  182. if (FeatureEscapedToHashBangFragment != (parser.Flags & FeaturesEscapedFragment))
  183. break;
  184. qry.Get().RSplit('&', qryBeforeEscapedFragment, qryEscapedFragment);
  185. if (!qryEscapedFragment.StartsWith(escFragPrefix)) {
  186. qryEscapedFragment.Clear();
  187. break;
  188. }
  189. qryEscapedFragment.Skip(escFragPrefix.length());
  190. buflen += 2; // for '!' and '\0' in fragment
  191. buflen -= escFragPrefix.length();
  192. }
  193. } while (false);
  194. // now set all fields prior to validating
  195. Alloc(buflen);
  196. TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
  197. for (int idx = 0; idx < FieldUrlMAX; ++idx) {
  198. const EField fld = EField(idx);
  199. const TSection& section = parser.Get(fld);
  200. if (!section.IsSet() || FldIsSet(fld))
  201. continue;
  202. if (FieldQuery == fld && encHashBangFrag)
  203. continue;
  204. if (FieldFrag == fld && qryEscapedFragment.IsInited())
  205. continue;
  206. char* beg = out.Buf();
  207. TStringBuf val = section.Get();
  208. long careFlags = section.GetFlagsEncode();
  209. switch (fld) {
  210. default:
  211. break;
  212. case FieldQuery:
  213. if (qryEscapedFragment.IsInited()) {
  214. const EField dstfld = FieldFrag; // that's where we will store
  215. out << '!';
  216. if (!qryEscapedFragment.empty())
  217. ReEncodeToField(out, qryEscapedFragment, fld, FeatureDecodeANY | careFlags, dstfld, FeatureDecodeANY | parser.GetFieldFlags(dstfld));
  218. FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
  219. if (qryBeforeEscapedFragment.empty())
  220. continue;
  221. out << '\0';
  222. beg = out.Buf();
  223. val = qryBeforeEscapedFragment;
  224. }
  225. break;
  226. case FieldFrag:
  227. if (encHashBangFrag) {
  228. const EField dstfld = FieldQuery; // that's where we will store
  229. const TSection& qry = parser.Get(dstfld);
  230. if (qry.IsSet())
  231. if (AppendField(out, dstfld, qry.Get(), qry.GetFlagsEncode()))
  232. out << '&';
  233. out << escFragPrefix;
  234. val.Skip(1); // skip '!'
  235. ReEncodeToField(out, val, fld, careFlags, dstfld, parser.GetFieldFlags(dstfld));
  236. FldSetNoDirty(dstfld, TStringBuf(beg, out.Buf()));
  237. continue;
  238. }
  239. break;
  240. }
  241. AppendField(out, fld, val, careFlags);
  242. char* end = out.Buf();
  243. if (careFlags & FeaturePathOperation) {
  244. if (!PathOperation(beg, end, PathOperationFlag(parser.Flags)))
  245. return ParsedBadPath;
  246. Y_ASSERT(beg >= out.Beg());
  247. out.SetPos(end);
  248. }
  249. FldSetNoDirty(fld, TStringBuf(beg, end));
  250. // special character case
  251. const long checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
  252. if (0 != checkChars) { // has unencoded special chars: check permission
  253. const long allowChars = parser.GetFieldFlags(fld) & checkChars;
  254. if (checkChars != allowChars)
  255. ret = ParsedBadFormat;
  256. }
  257. out << '\0';
  258. }
  259. if (hostConverted) {
  260. char* beg = out.Buf();
  261. out << hostascii;
  262. char* end = out.Buf();
  263. const EField fld = convertIDN ? FieldHost : FieldHostAscii;
  264. FldSetNoDirty(fld, TStringBuf(beg, end));
  265. out << '\0';
  266. }
  267. Buffer.Resize(out.Len());
  268. if (GetScheme() == SchemeEmpty && SchemeEmpty != defscheme) {
  269. if (SchemeUnknown == defscheme)
  270. ret = ParsedBadScheme;
  271. else
  272. SetSchemeImpl(defscheme);
  273. }
  274. if (0 == (parser.Flags & FeatureAllowEmptyPath))
  275. CheckMissingFields();
  276. const TStringBuf& port = GetField(FieldPort);
  277. if (!port.empty()) {
  278. if (!TryFromString<ui16>(port, Port))
  279. ret = ParsedBadPort;
  280. }
  281. if (ParsedOK != ret)
  282. return ret;
  283. // run validity checks now that all fields are set
  284. // check the host for DNS compliance
  285. do {
  286. if (0 == (flags & FeatureCheckHost))
  287. break;
  288. if (hostascii.empty())
  289. hostascii = GetField(FieldHost);
  290. if (hostascii.empty())
  291. break;
  292. // IP literal
  293. if ('[' == hostascii[0] && ']' == hostascii.back())
  294. break;
  295. ret = CheckHost(hostascii);
  296. if (ParsedOK != ret)
  297. return ret;
  298. } while (false);
  299. return ret;
  300. }
  301. TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
  302. Clear();
  303. if (url.empty())
  304. return ParsedEmpty;
  305. if (maxlen > 0 && url.length() > maxlen)
  306. return ParsedTooLong;
  307. const TParser parser(flags, url, enc);
  308. return AssignImpl(parser, defscheme);
  309. }
  310. TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
  311. const TParseFlags flags1 = flags.Exclude(FeatureNoRelPath);
  312. TState::EParsed ret = ParseImpl(url, url_base.empty() ? flags : flags1, maxlen, SchemeEmpty, enc);
  313. if (ParsedOK != ret)
  314. return ret;
  315. if (!url_base.empty() && !IsValidAbs()) {
  316. TUri base;
  317. ret = base.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
  318. if (ParsedOK != ret)
  319. return ret;
  320. Merge(base, PathOperationFlag(flags));
  321. }
  322. Rewrite();
  323. return ret;
  324. }
  325. TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
  326. const TState::EParsed ret = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
  327. if (ParsedOK != ret)
  328. return ret;
  329. if (!IsValidAbs())
  330. Merge(base, PathOperationFlag(flags));
  331. Rewrite();
  332. return ret;
  333. }
  334. TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defscheme, ECharset enc) {
  335. const TState::EParsed ret = ParseImpl(
  336. url, flags | FeatureNoRelPath, maxlen, defscheme, enc);
  337. if (ParsedOK != ret)
  338. return ret;
  339. if (IsNull(FlagHost))
  340. return ParsedBadHost;
  341. Rewrite();
  342. return ParsedOK;
  343. }
  344. }