uri.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. #pragma once
  2. #include "common.h"
  3. #include "encode.h"
  4. #include <library/cpp/charset/doccodes.h>
  5. #include <util/generic/buffer.h>
  6. #include <util/generic/ptr.h>
  7. #include <util/generic/singleton.h>
  8. #include <util/generic/string.h>
  9. #include <util/memory/alloc.h>
  10. #include <util/stream/mem.h>
  11. #include <util/stream/output.h>
  12. #include <util/stream/str.h>
  13. #include <util/system/yassert.h>
  14. #include <cstdlib>
  15. namespace NUri {
  16. /********************************************************/
  17. class TUri
  18. : public TFeature,
  19. public TField,
  20. public TScheme,
  21. public TState {
  22. public:
  23. enum TLinkType {
  24. LinkIsBad,
  25. LinkBadAbs,
  26. LinkIsFragment,
  27. LinkIsLocal,
  28. LinkIsGlobal
  29. };
  30. private:
  31. TBuffer Buffer;
  32. TStringBuf Fields[FieldAllMAX];
  33. ui32 FieldsSet;
  34. ui16 Port;
  35. ui16 DefaultPort;
  36. TScheme::EKind Scheme;
  37. /// contains fields out of buffer (and possibly not null-terminated)
  38. ui32 FieldsDirty;
  39. private:
  40. void Alloc(size_t len) {
  41. Dealloc(); // to prevent copy below
  42. Buffer.Resize(len);
  43. }
  44. void Dealloc() {
  45. Buffer.Clear();
  46. }
  47. void ClearImpl() {
  48. Port = 0;
  49. FieldsSet = 0;
  50. Scheme = SchemeEmpty;
  51. FieldsDirty = 0;
  52. }
  53. void CopyData(const TUri& url) {
  54. FieldsSet = url.FieldsSet;
  55. Port = url.Port;
  56. DefaultPort = url.DefaultPort;
  57. Scheme = url.Scheme;
  58. FieldsDirty = url.FieldsDirty;
  59. }
  60. void CopyImpl(const TUri& url) {
  61. for (int i = 0; i < FieldAllMAX; ++i)
  62. Fields[i] = url.Fields[i];
  63. RewriteImpl();
  64. }
  65. private:
  66. static ui32 FldFlag(EField fld) {
  67. return 1 << fld;
  68. }
  69. public:
  70. static bool FldIsValid(EField fld) {
  71. return 0 <= fld && FieldAllMAX > fld;
  72. }
  73. bool FldSetCmp(ui32 chk, ui32 exp) const {
  74. return (FieldsSet & chk) == exp;
  75. }
  76. bool FldSetCmp(ui32 chk) const {
  77. return FldSetCmp(chk, chk);
  78. }
  79. bool FldIsSet(EField fld) const {
  80. return !FldSetCmp(FldFlag(fld), 0);
  81. }
  82. private:
  83. void FldMarkSet(EField fld) {
  84. FieldsSet |= FldFlag(fld);
  85. }
  86. void FldMarkUnset(EField fld) {
  87. FieldsSet &= ~FldFlag(fld);
  88. }
  89. // use when we know the field is dirty or RewriteImpl will be called
  90. void FldSetNoDirty(EField fld, const TStringBuf& value) {
  91. Fields[fld] = value;
  92. FldMarkSet(fld);
  93. }
  94. void FldSet(EField fld, const TStringBuf& value) {
  95. FldSetNoDirty(fld, value);
  96. FldMarkDirty(fld);
  97. }
  98. const TStringBuf& FldGet(EField fld) const {
  99. return Fields[fld];
  100. }
  101. private:
  102. /// depending on value, clears or sets it
  103. void FldChkSet(EField fld, const TStringBuf& value) {
  104. if (value.IsInited())
  105. FldSet(fld, value);
  106. else
  107. FldClr(fld);
  108. }
  109. void FldChkSet(EField fld, const TUri& other) {
  110. FldChkSet(fld, other.GetField(fld));
  111. }
  112. /// set only if initialized
  113. bool FldTrySet(EField fld, const TStringBuf& value) {
  114. const bool ok = value.IsInited();
  115. if (ok)
  116. FldSet(fld, value);
  117. return ok;
  118. }
  119. bool FldTrySet(EField fld, const TUri& other) {
  120. return FldTrySet(fld, other.GetField(fld));
  121. }
  122. private:
  123. /// copies the value if it fits
  124. bool FldTryCpy(EField fld, const TStringBuf& value);
  125. // main method: sets the field value, possibly copies, etc.
  126. bool FldSetImpl(EField fld, TStringBuf value, bool strconst = false, bool nocopy = false);
  127. public: // clear a field
  128. void FldClr(EField fld) {
  129. Fields[fld].Clear();
  130. FldMarkUnset(fld);
  131. FldMarkClean(fld);
  132. }
  133. bool FldTryClr(EField field) {
  134. const bool ok = FldIsSet(field);
  135. if (ok)
  136. FldClr(field);
  137. return ok;
  138. }
  139. public: // set a field value: might leave state dirty and require a Rewrite()
  140. // copies if fits and not dirty, sets and marks dirty otherwise
  141. bool FldMemCpy(EField field, const TStringBuf& value) {
  142. return FldSetImpl(field, value, false);
  143. }
  144. // uses directly, marks dirty
  145. /// @note client MUST guarantee value will be alive until Rewrite is called
  146. bool FldMemSet(EField field, const TStringBuf& value) {
  147. return FldSetImpl(field, value, false, true);
  148. }
  149. // uses directly, doesn't mark dirty (value scope exceeds "this")
  150. bool FldMemUse(EField field, const TStringBuf& value) {
  151. return FldSetImpl(field, value, true);
  152. }
  153. // uses directly, doesn't mark dirty
  154. template <size_t size>
  155. bool FldMemSet(EField field, const char (&value)[size]) {
  156. static_assert(size > 0);
  157. return FldSetImpl(field, TStringBuf(value, size - 1), true);
  158. }
  159. // duplicate one field to another
  160. bool FldDup(EField src, EField dst) {
  161. if (!FldIsSet(src) || !FldIsValid(dst))
  162. return false;
  163. FldSetNoDirty(dst, FldGet(src));
  164. if (FldIsDirty(src))
  165. FldMarkDirty(dst);
  166. else
  167. FldMarkClean(dst);
  168. return true;
  169. }
  170. // move one field to another
  171. bool FldMov(EField src, EField dst) {
  172. if (!FldDup(src, dst))
  173. return false;
  174. FldClr(src);
  175. return true;
  176. }
  177. private:
  178. bool IsInBuffer(const char* buf) const {
  179. return buf >= Buffer.data() && buf < Buffer.data() + Buffer.size();
  180. }
  181. public:
  182. bool FldIsDirty() const {
  183. return 0 != FieldsDirty;
  184. }
  185. bool FldIsDirty(EField fld) const {
  186. return 0 != (FieldsDirty & FldFlag(fld));
  187. }
  188. private:
  189. void FldMarkDirty(EField fld) {
  190. FieldsDirty |= FldFlag(fld);
  191. }
  192. void FldMarkClean(EField fld) {
  193. FieldsDirty &= ~FldFlag(fld);
  194. }
  195. void RewriteImpl();
  196. public:
  197. static TState::EParsed CheckHost(const TStringBuf& host);
  198. // convert a [potential] IDN to ascii
  199. static TMallocPtr<char> IDNToAscii(const wchar32* idna);
  200. static TMallocPtr<char> IDNToAscii(const TStringBuf& host, ECharset enc = CODES_UTF8);
  201. // convert hosts with percent-encoded or extended chars
  202. // returns non-empty string if host can be converted to ASCII with given parameters
  203. static TStringBuf HostToAscii(TStringBuf host, TMallocPtr<char>& buf, bool hasExtended, bool allowIDN, ECharset enc = CODES_UTF8);
  204. // returns host if already ascii, or non-empty if it can be converted
  205. static TStringBuf HostToAscii(const TStringBuf& host, TMallocPtr<char>& buf, bool allowIDN, ECharset enc = CODES_UTF8);
  206. public:
  207. explicit TUri(unsigned defaultPort = 0)
  208. : FieldsSet(0)
  209. , Port(0)
  210. , DefaultPort(static_cast<ui16>(defaultPort))
  211. , Scheme(SchemeEmpty)
  212. , FieldsDirty(0)
  213. {
  214. }
  215. TUri(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query = TStringBuf(), const TStringBuf& scheme = "http", unsigned defaultPort = 0, const TStringBuf& hashbang = TStringBuf());
  216. TUri(const TUri& url)
  217. : FieldsSet(url.FieldsSet)
  218. , Port(url.Port)
  219. , DefaultPort(url.DefaultPort)
  220. , Scheme(url.Scheme)
  221. , FieldsDirty(url.FieldsDirty)
  222. {
  223. CopyImpl(url);
  224. }
  225. ~TUri() {
  226. Clear();
  227. }
  228. void Copy(const TUri& url) {
  229. if (&url != this) {
  230. CopyData(url);
  231. CopyImpl(url);
  232. }
  233. }
  234. void Clear() {
  235. Dealloc();
  236. ClearImpl();
  237. }
  238. ui32 GetFieldMask() const {
  239. return FieldsSet;
  240. }
  241. ui32 GetUrlFieldMask() const {
  242. return GetFieldMask() & FlagUrlFields;
  243. }
  244. ui32 GetDirtyMask() const {
  245. return FieldsDirty;
  246. }
  247. void CheckMissingFields();
  248. // Process methods
  249. void Rewrite() {
  250. if (FldIsDirty())
  251. RewriteImpl();
  252. }
  253. private:
  254. TState::EParsed AssignImpl(const TParser& parser, TScheme::EKind defscheme = SchemeEmpty);
  255. TState::EParsed ParseImpl(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, TScheme::EKind defscheme = SchemeEmpty, ECharset enc = CODES_UTF8);
  256. public:
  257. TState::EParsed Assign(const TParser& parser, TScheme::EKind defscheme = SchemeEmpty) {
  258. const TState::EParsed ret = AssignImpl(parser, defscheme);
  259. if (ParsedOK == ret)
  260. Rewrite();
  261. return ret;
  262. }
  263. TState::EParsed ParseUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8) {
  264. const TState::EParsed ret = ParseImpl(url, flags, maxlen, SchemeEmpty, enc);
  265. if (ParsedOK == ret)
  266. Rewrite();
  267. return ret;
  268. }
  269. // parses absolute URIs
  270. // prepends default scheme (unless unknown) if URI has none
  271. TState::EParsed ParseAbsUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, TScheme::EKind defscheme = SchemeUnknown, ECharset enc = CODES_UTF8);
  272. TState::EParsed ParseAbsOrHttpUri(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8) {
  273. return ParseAbsUri(url, flags, maxlen, SchemeHTTP, enc);
  274. }
  275. TState::EParsed Parse(const TStringBuf& url, const TUri& base, const TParseFlags& flags = FeaturesDefault, ui32 maxlen = 0, ECharset enc = CODES_UTF8);
  276. TState::EParsed Parse(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault) {
  277. return ParseUri(url, flags);
  278. }
  279. TState::EParsed Parse(const TStringBuf& url, const TParseFlags& flags, const TStringBuf& base_url, ui32 maxlen = 0, ECharset enc = CODES_UTF8);
  280. TState::EParsed ParseAbs(const TStringBuf& url, const TParseFlags& flags = FeaturesDefault, const TStringBuf& base_url = TStringBuf(), ui32 maxlen = 0, ECharset enc = CODES_UTF8) {
  281. const TState::EParsed result = Parse(url, flags, base_url, maxlen, enc);
  282. return ParsedOK != result || IsValidGlobal() ? result : ParsedBadFormat;
  283. }
  284. // correctAbs works with head "/.." portions:
  285. // 1 - reject URL
  286. // 0 - keep portions
  287. // -1 - ignore portions
  288. void Merge(const TUri& base, int correctAbs = -1);
  289. TLinkType Normalize(const TUri& base, const TStringBuf& link, const TStringBuf& codebase = TStringBuf(), ui64 careFlags = FeaturesDefault, ECharset enc = CODES_UTF8);
  290. private:
  291. int PrintFlags(int flags) const {
  292. if (0 == (FlagUrlFields & flags))
  293. flags |= FlagUrlFields;
  294. return flags;
  295. }
  296. protected:
  297. size_t PrintSize(ui32 flags) const;
  298. // Output method, prints to stream
  299. IOutputStream& PrintImpl(IOutputStream& out, int flags) const;
  300. char* PrintImpl(char* str, size_t size, int flags) const {
  301. TMemoryOutput out(str, size);
  302. PrintImpl(out, flags) << '\0';
  303. return str;
  304. }
  305. static bool IsAbsPath(const TStringBuf& path) {
  306. return 1 <= path.length() && path[0] == '/';
  307. }
  308. bool IsAbsPathImpl() const {
  309. return IsAbsPath(GetField(FieldPath));
  310. }
  311. public:
  312. // Output method, prints to stream
  313. IOutputStream& Print(IOutputStream& out, int flags = FlagUrlFields) const {
  314. return PrintImpl(out, PrintFlags(flags));
  315. }
  316. // Output method, print to str, allocate memory if str is NULL
  317. // Should be deprecated
  318. char* Print(char* str, size_t size, int flags = FlagUrlFields) const {
  319. return nullptr == str ? Serialize(flags) : Serialize(str, size, flags);
  320. }
  321. char* Serialize(char* str, size_t size, int flags = FlagUrlFields) const {
  322. Y_ASSERT(str);
  323. flags = PrintFlags(flags);
  324. const size_t printSize = PrintSize(flags) + 1;
  325. return printSize > size ? nullptr : PrintImpl(str, size, flags);
  326. }
  327. char* Serialize(int flags = FlagUrlFields) const {
  328. flags = PrintFlags(flags);
  329. const size_t size = PrintSize(flags) + 1;
  330. return PrintImpl(static_cast<char*>(malloc(size)), size, flags);
  331. }
  332. // Output method to str
  333. void Print(TString& str, int flags = FlagUrlFields) const {
  334. flags = PrintFlags(flags);
  335. str.reserve(str.length() + PrintSize(flags));
  336. TStringOutput out(str);
  337. PrintImpl(out, flags);
  338. }
  339. TString PrintS(int flags = FlagUrlFields) const {
  340. TString str;
  341. Print(str, flags);
  342. return str;
  343. }
  344. // Only non-default scheme and port are printed
  345. char* PrintHost(char* str, size_t size) const {
  346. return Print(str, size, (Scheme != SchemeHTTP ? FlagScheme : 0) | FlagHostPort);
  347. }
  348. TString PrintHostS() const {
  349. return PrintS((Scheme != SchemeHTTP ? FlagScheme : 0) | FlagHostPort);
  350. }
  351. // Info methods
  352. int Compare(const TUri& A, int flags = FlagUrlFields) const;
  353. int CompareField(EField fld, const TUri& url) const;
  354. const TStringBuf& GetField(EField fld) const {
  355. return FldIsValid(fld) && FldIsSet(fld) ? FldGet(fld) : Default<TStringBuf>();
  356. }
  357. ui16 GetPort() const {
  358. return 0 == Port ? DefaultPort : Port;
  359. }
  360. const TStringBuf& GetHost() const {
  361. if (GetFieldMask() & FlagHostAscii)
  362. return FldGet(FieldHostAscii);
  363. if (GetFieldMask() & FlagHost)
  364. return FldGet(FieldHost);
  365. return Default<TStringBuf>();
  366. }
  367. bool UseHostAscii() {
  368. return FldMov(FieldHostAscii, FieldHost);
  369. }
  370. TScheme::EKind GetScheme() const {
  371. return Scheme;
  372. }
  373. const TSchemeInfo& GetSchemeInfo() const {
  374. return TSchemeInfo::Get(Scheme);
  375. }
  376. bool IsNull(ui32 flags = FlagScheme | FlagHost | FlagPath) const {
  377. return !FldSetCmp(flags);
  378. }
  379. bool IsNull(EField fld) const {
  380. return !FldIsSet(fld);
  381. }
  382. bool IsValidAbs() const {
  383. if (IsNull(FlagScheme | FlagHost | FlagPath))
  384. return false;
  385. return IsAbsPathImpl();
  386. }
  387. bool IsValidGlobal() const {
  388. if (IsNull(FlagScheme | FlagHost))
  389. return false;
  390. if (IsNull(FlagPath))
  391. return true;
  392. return IsAbsPathImpl();
  393. }
  394. bool IsRootless() const {
  395. return FldSetCmp(FlagScheme | FlagHost | FlagPath, FlagScheme | FlagPath) && !IsAbsPathImpl();
  396. }
  397. // for RFC 2396 compatibility
  398. bool IsOpaque() const {
  399. return IsRootless();
  400. }
  401. // Inline helpers
  402. TUri& operator=(const TUri& u) {
  403. Copy(u);
  404. return *this;
  405. }
  406. bool operator!() const {
  407. return IsNull();
  408. }
  409. bool Equal(const TUri& A, int flags = FlagUrlFields) const {
  410. return (Compare(A, flags) == 0);
  411. }
  412. bool Less(const TUri& A, int flags = FlagUrlFields) const {
  413. return (Compare(A, flags) < 0);
  414. }
  415. bool operator==(const TUri& A) const {
  416. return Equal(A, FlagNoFrag);
  417. }
  418. bool operator!=(const TUri& A) const {
  419. return !Equal(A, FlagNoFrag);
  420. }
  421. bool operator<(const TUri& A) const {
  422. return Less(A, FlagNoFrag);
  423. }
  424. bool IsSameDocument(const TUri& other) const {
  425. // pre: both *this and 'other' should be normalized to valid abs
  426. Y_ASSERT(IsValidAbs());
  427. return Equal(other, FlagNoFrag);
  428. }
  429. bool IsLocal(const TUri& other) const {
  430. // pre: both *this and 'other' should be normalized to valid abs
  431. Y_ASSERT(IsValidAbs() && other.IsValidAbs());
  432. return Equal(other, FlagScheme | FlagHostPort);
  433. }
  434. TLinkType Locality(const TUri& other) const {
  435. if (IsSameDocument(other))
  436. return LinkIsFragment;
  437. else if (IsLocal(other))
  438. return LinkIsLocal;
  439. return LinkIsGlobal;
  440. }
  441. static IOutputStream& ReEncodeField(IOutputStream& out, const TStringBuf& val, EField fld, ui64 flags = FeaturesEncodeDecode) {
  442. return NEncode::TEncoder::ReEncode(out, val, NEncode::TEncodeMapper(flags, fld));
  443. }
  444. static IOutputStream& ReEncodeToField(IOutputStream& out, const TStringBuf& val, EField srcfld, ui64 srcflags, EField dstfld, ui64 dstflags) {
  445. return NEncode::TEncoder::ReEncodeTo(out, val, NEncode::TEncodeMapper(srcflags, srcfld), NEncode::TEncodeToMapper(dstflags, dstfld));
  446. }
  447. static IOutputStream& ReEncode(IOutputStream& out, const TStringBuf& val, ui64 flags = FeaturesEncodeDecode) {
  448. return ReEncodeField(out, val, FieldAllMAX, flags);
  449. }
  450. static int PathOperationFlag(const TParseFlags& flags) {
  451. return flags & FeaturePathDenyRootParent ? 1
  452. : flags & FeaturePathStripRootParent ? -1 : 0;
  453. }
  454. static bool PathOperation(char*& pathBeg, char*& pathEnd, int correctAbs);
  455. private:
  456. const TSchemeInfo& SetSchemeImpl(const TSchemeInfo& info) {
  457. Scheme = info.Kind;
  458. DefaultPort = info.Port;
  459. if (!info.Str.empty())
  460. FldSetNoDirty(FieldScheme, info.Str);
  461. return info;
  462. }
  463. const TSchemeInfo& SetSchemeImpl(TScheme::EKind scheme) {
  464. return SetSchemeImpl(TSchemeInfo::Get(scheme));
  465. }
  466. public:
  467. const TSchemeInfo& SetScheme(const TSchemeInfo& info) {
  468. SetSchemeImpl(info);
  469. if (!info.Str.empty())
  470. FldMarkClean(FieldScheme);
  471. return info;
  472. }
  473. const TSchemeInfo& SetScheme(TScheme::EKind scheme) {
  474. return SetScheme(TSchemeInfo::Get(scheme));
  475. }
  476. };
  477. class TUriUpdate {
  478. TUri& Uri_;
  479. public:
  480. TUriUpdate(TUri& uri)
  481. : Uri_(uri)
  482. {
  483. }
  484. ~TUriUpdate() {
  485. Uri_.Rewrite();
  486. }
  487. public:
  488. bool Set(TField::EField field, const TStringBuf& value) {
  489. return Uri_.FldMemSet(field, value);
  490. }
  491. template <size_t size>
  492. bool Set(TField::EField field, const char (&value)[size]) {
  493. return Uri_.FldMemSet(field, value);
  494. }
  495. void Clr(TField::EField field) {
  496. Uri_.FldClr(field);
  497. }
  498. };
  499. const char* LinkTypeToString(const TUri::TLinkType& t);
  500. }
  501. Y_DECLARE_OUT_SPEC(inline, NUri::TUri, out, url) {
  502. url.Print(out);
  503. }
  504. Y_DECLARE_OUT_SPEC(inline, NUri::TUri::TLinkType, out, t) {
  505. out << NUri::LinkTypeToString(t);
  506. }