assign.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. #include "uri.h"
  2. #include "parse.h"
  3. #include <idna.h>
  4. #include <library/cpp/charset/recyr.hh>
  5. #include <util/charset/wide.h>
  6. #include <util/memory/tempbuf.h>
  7. #include <util/string/cast.h>
  8. #include <util/system/yassert.h>
  9. #include <util/system/sys_alloc.h>
  10. namespace NUri {
  11. static const TStringBuf ESCAPED_FRAGMENT(TStringBuf("_escaped_fragment_="));
  12. TMallocPtr<char> TUri::IDNToAscii(const wchar32* idna) {
  13. // XXX: don't use punycode_encode directly as it doesn't include
  14. // proper stringprep and splitting on dot-equivalent characters
  15. char* buf;
  16. static_assert(sizeof(*idna) == sizeof(ui32), "fixme");
  17. if (IDNA_SUCCESS != idna_to_ascii_4z((const uint32_t*) idna, &buf, 0)) {
  18. buf = nullptr;
  19. }
  20. return buf;
  21. }
  22. TMallocPtr<char> TUri::IDNToAscii(const TStringBuf& host, ECharset enc) {
  23. TTempBuf buf(sizeof(wchar32) * (1 + host.length()));
  24. wchar32* wbuf = reinterpret_cast<wchar32*>(buf.Data());
  25. const size_t written = NDetail::NBaseOps::Recode(host, wbuf, enc).length();
  26. wbuf[written] = 0;
  27. return IDNToAscii(wbuf);
  28. }
  29. TStringBuf TUri::HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc) {
  30. TStringBuf outHost; // store the result here before returning it, to get RVO
  31. size_t buflen = 0;
  32. if (hasExtended && !allowIDN) {
  33. return outHost; // definitely can't convert
  34. }
  35. // charset-recode: RFC 3986, 3.2.2, requires percent-encoded non-ASCII
  36. // chars in reg-name to be UTF-8 so convert to UTF-8 prior to decoding
  37. const bool recoding = CODES_UTF8 != enc && hasExtended;
  38. if (recoding) {
  39. size_t nrd, nwr;
  40. buflen = host.length() * 4;
  41. buf.Reset(static_cast<char*>(y_allocate(buflen)));
  42. if (RECODE_OK != Recode(enc, CODES_UTF8, host.data(), buf.Get(), host.length(), buflen, nrd, nwr)) {
  43. return outHost;
  44. }
  45. host = TStringBuf(buf.Get(), nwr);
  46. }
  47. // percent-decode
  48. if (0 == buflen) {
  49. buflen = host.length();
  50. buf.Reset(static_cast<char*>(y_allocate(buflen)));
  51. }
  52. // decoding shortens so writing over host in buf is OK
  53. TMemoryWriteBuffer out(buf.Get(), buflen);
  54. TEncoder decoder(out, FeatureDecodeANY | FeatureToLower);
  55. const ui64 outFlags = decoder.ReEncode(host);
  56. hasExtended = 0 != (outFlags & FeatureEncodeExtendedASCII);
  57. // check again
  58. if (hasExtended && !allowIDN) {
  59. return outHost;
  60. }
  61. host = out.Str();
  62. // convert to punycode if needed
  63. if (!hasExtended) {
  64. outHost = host;
  65. return outHost;
  66. }
  67. TMallocPtr<char> puny;
  68. try {
  69. puny = IDNToAscii(host);
  70. } catch (const yexception& /* exc */) {
  71. }
  72. if (!puny) {
  73. // XXX: try user charset unless UTF8 or converted to it
  74. if (CODES_UTF8 == enc || recoding) {
  75. return outHost;
  76. }
  77. try {
  78. puny = IDNToAscii(host, enc);
  79. } catch (const yexception& /* exc */) {
  80. return outHost;
  81. }
  82. if (!puny) {
  83. return outHost;
  84. }
  85. }
  86. buf = puny;
  87. outHost = buf.Get();
  88. return outHost;
  89. }
  90. TStringBuf TUri::HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc) {
  91. // find what we have
  92. ui64 haveFlags = 0;
  93. for (size_t i = 0; i != host.length(); ++i) {
  94. haveFlags |= TEncoder::GetFlags(host[i]).FeatFlags;
  95. }
  96. // interested in encoded characters or (if IDN is allowed) extended ascii
  97. TStringBuf outHost;
  98. const bool haveExtended = haveFlags & FeatureEncodeExtendedASCII;
  99. if (!haveExtended || allowIDN) {
  100. if (!haveExtended && 0 == (haveFlags & FeatureDecodeANY)) {
  101. outHost = host;
  102. } else {
  103. outHost = HostToAscii(host, buf, haveExtended, allowIDN, enc);
  104. }
  105. }
  106. return outHost;
  107. }
  108. static inline bool AppendField(TMemoryWriteBuffer& out, TField::EField field, const TStringBuf& value, ui64 flags) {
  109. if (value.empty()) {
  110. return false;
  111. }
  112. if (flags & TFeature::FeaturesAllEncoder) {
  113. TUri::ReEncodeField(out, value, field, flags);
  114. } else {
  115. out << value;
  116. }
  117. return true;
  118. }
  119. class THashBangModifier {
  120. public:
  121. TStringBuf HashBang;
  122. TStringBuf Query;
  123. bool FromFragmentToHashBang = false;
  124. bool FromQueryToFragment = false;
  125. bool FromFragmentToQuery = false;
  126. THashBangModifier() = default;
  127. bool ParseHashBangFromFragment(const TParser& parser) {
  128. const TSection& fragment = parser.Get(TField::FieldFragment);
  129. if (fragment.IsSet()) {
  130. HashBang = fragment.Get();
  131. if (!HashBang.empty() && '!' == HashBang[0]) {
  132. HashBang.Skip(1); // remove !
  133. return true;
  134. }
  135. }
  136. return false;
  137. }
  138. bool ParseHashBangFromQuery(const TParser& parser) {
  139. const TSection& query = parser.Get(TField::FieldQuery);
  140. if (query.IsSet()) {
  141. query.Get().RSplit('&', Query, HashBang);
  142. if (HashBang.StartsWith(ESCAPED_FRAGMENT)) {
  143. HashBang.Skip(ESCAPED_FRAGMENT.length());
  144. return true;
  145. }
  146. }
  147. return false;
  148. }
  149. void Parse(const TParser& parser, size_t& buflen) {
  150. if (0 != (parser.Flags & TFeature::FeatureFragmentToHashBang)) {
  151. if (ParseHashBangFromFragment(parser)) {
  152. FromFragmentToHashBang = true;
  153. buflen += 1; // for '\0'
  154. buflen += 2 * HashBang.length(); // encode
  155. }
  156. } else if (0 != (parser.Flags & TFeature::FeatureHashBangToEscapedFragment)) {
  157. if (ParseHashBangFromFragment(parser)) {
  158. FromFragmentToQuery = true;
  159. buflen += ESCAPED_FRAGMENT.length();
  160. buflen += 2 * HashBang.length(); // encode
  161. }
  162. } else if (0 != (parser.Flags & TFeature::FeatureEscapedToHashBangFragment)) {
  163. if (ParseHashBangFromQuery(parser)) {
  164. FromQueryToFragment = true;
  165. buflen += 2; // for '!' and '\0'
  166. buflen -= ESCAPED_FRAGMENT.length();
  167. }
  168. }
  169. }
  170. bool AppendQuery(TMemoryWriteBuffer& out, const TParser& parser) const {
  171. const TSection& query = parser.Get(TField::FieldQuery);
  172. if (FromQueryToFragment) {
  173. return AppendField(out, TField::FieldQuery, Query, query.GetFlagsEncode());
  174. }
  175. if (FromFragmentToQuery) {
  176. if (AppendField(out, TField::FieldQuery, query.Get(), query.GetFlagsEncode())) {
  177. out << '&';
  178. }
  179. out << ESCAPED_FRAGMENT;
  180. const TSection& fragment = parser.Get(TField::FieldFragment);
  181. TUri::ReEncodeToField(
  182. out, HashBang,
  183. TField::FieldFragment, fragment.GetFlagsEncode(),
  184. TField::FieldQuery, parser.GetFieldFlags(TField::FieldQuery)
  185. );
  186. return true;
  187. }
  188. if (!query.IsSet()) {
  189. return false;
  190. }
  191. AppendField(out, TField::FieldQuery, query.Get(), query.GetFlagsEncode());
  192. return true; // may be empty
  193. }
  194. bool AppendHashBang(TMemoryWriteBuffer& out, const TParser& parser) const {
  195. if (FromFragmentToHashBang) {
  196. const TSection& fragment = parser.Get(TField::FieldFragment);
  197. TUri::ReEncodeToField(
  198. out, HashBang,
  199. TField::FieldFragment, fragment.GetFlagsEncode(),
  200. TField::FieldHashBang, parser.GetFieldFlags(TField::FieldHashBang)
  201. );
  202. return true;
  203. }
  204. return false;
  205. }
  206. bool AppendFragment(TMemoryWriteBuffer& out, const TParser& parser) const {
  207. if (FromFragmentToQuery || FromFragmentToHashBang) {
  208. return false;
  209. }
  210. if (FromQueryToFragment) {
  211. const TSection& query = parser.Get(TField::FieldQuery);
  212. out << '!';
  213. TUri::ReEncodeToField(
  214. out, HashBang,
  215. TField::FieldQuery, TFeature::FeatureDecodeANY | query.GetFlagsEncode(),
  216. TField::FieldFragment, TFeature::FeatureDecodeANY | parser.GetFieldFlags(TField::FieldFragment)
  217. );
  218. return true;
  219. }
  220. const TSection& fragment = parser.Get(TField::FieldFragment);
  221. if (!fragment.IsSet()) {
  222. return false;
  223. }
  224. AppendField(out, TField::FieldQuery, fragment.Get(), fragment.GetFlagsEncode());
  225. return true;
  226. }
  227. };
  228. TState::EParsed TUri::AssignImpl(const TParser& parser, TScheme::EKind defaultScheme) {
  229. Clear();
  230. TState::EParsed status = parser.State;
  231. if (ParsedBadFormat <= status) {
  232. return status;
  233. }
  234. const TSection& scheme = parser.Get(FieldScheme);
  235. const TSchemeInfo& schemeInfo = SetSchemeImpl(parser.Scheme);
  236. // set the scheme always if available
  237. if (schemeInfo.Str.empty() && scheme.IsSet()) {
  238. FldSet(FieldScheme, scheme.Get());
  239. }
  240. if (ParsedOK != status) {
  241. return status;
  242. }
  243. size_t buflen = 0;
  244. // special processing for fields
  245. const bool convertIDN = parser.Flags & FeatureConvertHostIDN;
  246. ui64 flags = parser.Flags.Allow;
  247. if (convertIDN) {
  248. flags |= FeatureAllowHostIDN | FeatureCheckHost;
  249. }
  250. // process non-ASCII host for punycode
  251. TMallocPtr<char> hostPtr;
  252. TStringBuf hostAsciiBuf;
  253. bool inHostNonAsciiChars = false;
  254. const TSection& host = parser.Get(FieldHost);
  255. if (host.IsSet() && !FldIsSet(FieldHost)) {
  256. const bool allowIDN = (flags & FeatureAllowHostIDN);
  257. const TStringBuf hostBuf = host.Get();
  258. // if we know we have and allow extended-ASCII chars, no need to check further
  259. if (allowIDN && (host.GetFlagsAllPlaintext() & FeatureEncodeExtendedASCII)) {
  260. hostAsciiBuf = HostToAscii(hostBuf, hostPtr, true, true, parser.Enc);
  261. } else {
  262. hostAsciiBuf = HostToAscii(hostBuf, hostPtr, allowIDN, parser.Enc);
  263. }
  264. if (hostAsciiBuf.empty()) {
  265. status = ParsedBadHost; // exists but cannot be converted
  266. } else if (hostBuf.data() != hostAsciiBuf.data()) {
  267. inHostNonAsciiChars = true;
  268. buflen += 1 + hostAsciiBuf.length();
  269. if (convertIDN) {
  270. FldMarkSet(FieldHost); // so that we don't process host below
  271. }
  272. }
  273. }
  274. // add unprocessed fields
  275. for (ui32 i = 0; i < FieldUrlMAX; ++i) {
  276. const EField field = EField(i);
  277. const TSection& section = parser.Get(field);
  278. if (section.IsSet() && !FldIsSet(field)) {
  279. buflen += 1 + section.EncodedLen(); // includes null
  280. }
  281. }
  282. if (0 == buflen) { // no more sections set?
  283. return status;
  284. }
  285. // process #! fragments
  286. // https://developers.google.com/webmasters/ajax-crawling/docs/specification
  287. THashBangModifier modifier;
  288. if (!FldIsSet(FieldFragment) && !FldIsSet(FieldQuery)) {
  289. modifier.Parse(parser, buflen);
  290. }
  291. // now set all fields prior to validating
  292. Alloc(buflen);
  293. TMemoryWriteBuffer out(Buffer.data(), Buffer.size());
  294. for (ui32 i = 0; i < FieldUrlMAX; ++i) {
  295. const EField field = EField(i);
  296. if (FldIsSet(field)) {
  297. continue;
  298. }
  299. const TSection& section = parser.Get(field);
  300. char* beg = out.Buf();
  301. if (field == FieldQuery) {
  302. if (!modifier.AppendQuery(out, parser)) {
  303. continue;
  304. }
  305. } else if (field == FieldHashBang) {
  306. if (!modifier.AppendHashBang(out, parser)) {
  307. continue;
  308. }
  309. } else if (field == FieldFragment) {
  310. if (!modifier.AppendFragment(out, parser)) {
  311. continue;
  312. }
  313. } else {
  314. if (!section.IsSet()) {
  315. continue;
  316. }
  317. AppendField(out, field, section.Get(), section.GetFlagsEncode()); // may be empty
  318. }
  319. // path operations case
  320. char* end = out.Buf();
  321. if (section.GetFlagsEncode() & FeaturePathOperation) {
  322. if (!PathOperation(beg, end, PathOperationFlag(parser.Flags))) {
  323. return ParsedBadPath;
  324. }
  325. Y_ASSERT(beg >= out.Beg());
  326. out.SetPos(end);
  327. }
  328. FldSetNoDirty(field, TStringBuf(beg, end));
  329. out << '\0';
  330. // special character case
  331. const ui64 checkChars = section.GetFlagsAllPlaintext() & FeaturesCheckSpecialChar;
  332. if (0 != checkChars) { // has unencoded special chars: check permission
  333. const ui64 allowChars = parser.GetFieldFlags(field) & checkChars;
  334. if (checkChars != allowChars) {
  335. status = ParsedBadFormat;
  336. }
  337. }
  338. }
  339. if (inHostNonAsciiChars) {
  340. char* beg = out.Buf();
  341. out << hostAsciiBuf;
  342. auto field = convertIDN ? FieldHost : FieldHostAscii;
  343. FldSetNoDirty(field, TStringBuf(beg, out.Buf()));
  344. out << '\0';
  345. }
  346. Buffer.Resize(out.Len());
  347. if (GetScheme() == SchemeEmpty && SchemeEmpty != defaultScheme) {
  348. if (SchemeUnknown == defaultScheme) {
  349. status = ParsedBadScheme;
  350. } else {
  351. SetSchemeImpl(defaultScheme);
  352. }
  353. }
  354. if (0 == (parser.Flags & FeatureAllowEmptyPath)) {
  355. CheckMissingFields();
  356. }
  357. const TStringBuf& port = GetField(FieldPort);
  358. if (!port.empty() && !TryFromString<ui16>(port, Port)) {
  359. return ParsedBadPort;
  360. }
  361. if (ParsedOK != status) {
  362. return status;
  363. }
  364. // run validity checks now that all fields are set
  365. // check the host for DNS compliance
  366. if (0 != (flags & FeatureCheckHost)) {
  367. if (hostAsciiBuf.empty()) {
  368. hostAsciiBuf = GetField(FieldHost);
  369. }
  370. if (!hostAsciiBuf.empty()) {
  371. // IP literal
  372. if ('[' != hostAsciiBuf[0] || ']' != hostAsciiBuf.back()) {
  373. status = CheckHost(hostAsciiBuf);
  374. }
  375. }
  376. }
  377. return status;
  378. }
  379. TState::EParsed TUri::ParseImpl(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defaultScheme, ECharset enc) {
  380. Clear();
  381. if (url.empty()) {
  382. return ParsedEmpty;
  383. }
  384. if (maxlen > 0 && url.length() > maxlen) {
  385. return ParsedTooLong;
  386. }
  387. const TParser parser(flags, url, enc);
  388. return AssignImpl(parser, defaultScheme);
  389. }
  390. TState::EParsed TUri::Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& url_base, ui32 maxlen, ECharset enc) {
  391. const TParseFlags parseFlags = url_base.empty() ? flags : flags.Exclude(FeatureNoRelPath);
  392. TState::EParsed status = ParseImpl(url, parseFlags, maxlen, SchemeEmpty, enc);
  393. if (ParsedOK != status) {
  394. return status;
  395. }
  396. if (!url_base.empty() && !IsValidAbs()) {
  397. TUri base;
  398. status = base.ParseImpl(url_base, flags, maxlen, SchemeEmpty, enc);
  399. if (ParsedOK != status) {
  400. return status;
  401. }
  402. Merge(base, PathOperationFlag(flags));
  403. }
  404. Rewrite();
  405. return status;
  406. }
  407. TState::EParsed TUri::Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags, ui32 maxlen, ECharset enc) {
  408. const TState::EParsed status = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
  409. if (ParsedOK != status) {
  410. return status;
  411. }
  412. if (!IsValidAbs()) {
  413. Merge(base, PathOperationFlag(flags));
  414. }
  415. Rewrite();
  416. return status;
  417. }
  418. TState::EParsed TUri::ParseAbsUri(const TStringBuf& url, const TParseFlags& flags, ui32 maxlen, TScheme::EKind defaultScheme, ECharset enc) {
  419. const TState::EParsed status = ParseImpl(url, flags | FeatureNoRelPath, maxlen, defaultScheme, enc);
  420. if (ParsedOK != status) {
  421. return status;
  422. }
  423. if (IsNull(FlagHost)) {
  424. return ParsedBadHost;
  425. }
  426. Rewrite();
  427. return ParsedOK;
  428. }
  429. }