// common.h
  1. #pragma once
  2. #include <util/stream/output.h>
  3. #include <util/system/compat.h>
  4. #include <util/generic/strbuf.h>
  5. namespace NUri {
  // Forward declarations of types that are only named in this header
  // (they appear in the friend list of TFeature); their definitions are elsewhere.
  namespace NEncode {
      class TEncoder;
      class TEncodeMapperBase;
      struct TCharFlags;
  } // namespace NEncode

  namespace NParse {
      class TRange;
  } // namespace NParse

  class TParser;
  /**
   * Names the components of a URI and the bit masks built from them.
   *
   * EField holds consecutive component indices; EFlags holds one bit per
   * index (1U << index) plus a few convenience combinations.
   */
  struct TField {
      enum EField {
          FieldScheme,
          FieldUser,
          FieldPass,
          FieldHost,
          FieldPort,
          FieldPath,
          FieldQuery,
          FieldFrag,
          FieldHashBang,
          // add fields above
          FieldUrlMAX,

          // step the counter back by one so that the extra fields below
          // continue numbering right after the last URL field
          FieldUrlLast = FieldUrlMAX - 1,

          // add extra fields below
          FieldHostAscii,
          // add extra fields above
          FieldAllMAX,

          // add aliases below
          FieldUsername = FieldUser,
          FieldPassword = FieldPass,
          FieldFragment = FieldFrag,
      };

      enum EFlags {
          // one bit per EField entry
          FlagScheme = 1U << FieldScheme,
          FlagUser = 1U << FieldUser,
          FlagPass = 1U << FieldPass,
          FlagHost = 1U << FieldHost,
          FlagPort = 1U << FieldPort,
          FlagPath = 1U << FieldPath,
          FlagQuery = 1U << FieldQuery,
          FlagFrag = 1U << FieldFrag,
          FlagHashBang = 1U << FieldHashBang,
          FlagUrlMAX = 1U << FieldUrlMAX,
          // FieldHostAscii has the same index as FieldUrlMAX, so this bit
          // coincides with FlagUrlMAX
          FlagHostAscii = 1U << FieldHostAscii,
          FlagAllMAX = 1U << FieldAllMAX,

          // combinations
          FlagHostPort = FlagHost | FlagPort,
          FlagAuth = FlagUser | FlagPass,
          FlagFragment = FlagFrag,
          FlagAction = FlagScheme | FlagHostPort | FlagPath,
          FlagNoFrag = FlagAction | FlagQuery | FlagHashBang,

          // every bit below FlagUrlMAX
          FlagUrlFields = FlagUrlMAX - 1,
          FlagAll = FlagUrlFields, // obsolete, for backwards compatibility
          // every bit below FlagAllMAX
          FlagAllFields = FlagAllMAX - 1
      };
  };
  /**
   * Result codes of URI parsing.
   *
   * The non-error states come first; ParsedBadFormat is the first error
   * state and every enumerator after it (up to ParsedMAX) is an error too.
   */
  struct TState {
      enum EParsed {
          // non-error states
          ParsedOK = 0,
          ParsedEmpty = 1,
          ParsedOpaque = 2,
          ParsedRootless = ParsedOpaque, // alias of ParsedOpaque

          // error states
          ParsedBadFormat, // must follow all non-error states immediately
          ParsedBadPath,
          ParsedTooLong,
          ParsedBadPort,
          ParsedBadAuth,
          ParsedBadScheme,
          ParsedBadHost,

          // add before this line
          ParsedMAX
      };
  };
  /**
   * Enumerates the URI schemes this library distinguishes.
   *
   * SchemeEmpty is the first value and SchemeUnknown the last; the known
   * schemes sit between them.
   */
  struct TScheme {
      // don't forget to define a SchemeRegistry entry
      enum EKind {
          SchemeEmpty,
          // add schemes below this line
          SchemeHTTP,
          SchemeHTTPS,
          SchemeFTP,
          SchemeFILE,
          SchemeWS,
          SchemeWSS,
          // add schemes above this line
          SchemeUnknown
      };
  };
  100. class TFeature {
  101. friend class NEncode::TEncoder;
  102. friend class NEncode::TEncodeMapperBase;
  103. friend struct NEncode::TCharFlags;
  104. friend class TParser;
  105. friend class NParse::TRange;
  106. #define FEATURE_NAME(f) _BitFeature##f
  107. #define FEATURE_FLAG_NAME(f) Feature##f
  108. #define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1ULL << FEATURE_NAME(f)
  109. protected:
  110. enum EBit {
  111. //==============================
  112. // Cases interpreted as errors:
  113. //==============================
  114. // allows authorization user/password in URL
  115. FEATURE_NAME(AuthSupported),
  116. // allows all known schemes in URL
  117. FEATURE_NAME(SchemeKnown),
  118. // allows all schemes, not only known
  119. FEATURE_NAME(SchemeFlexible),
  120. // allow opaque (RFC 2396) or rootless (RFC 3986) urls
  121. FEATURE_NAME(AllowRootless),
  122. //==============================
  123. // Cases interpreted for processing (if required):
  124. // (effects on result of Parse method)
  125. //==============================
  126. // path needs normalization
  127. // (simplification of directory tree: /../, /./, etc.
  128. FEATURE_NAME(PathOperation),
  129. // don't force empty path to "/"
  130. FEATURE_NAME(AllowEmptyPath),
  131. // in scheme and host segments:
  132. // change upper case letters onto lower case ones
  133. FEATURE_NAME(ToLower),
  134. // decode unreserved symbols
  135. FEATURE_NAME(DecodeUnreserved),
  136. // legacy: decode standard symbols which may be safe for some fields
  137. FEATURE_NAME(DecodeStandardExtra),
  138. // decode symbols allowed (not necessarily safe to decode) only for a given field
  139. // (do not use directly, instead use FeatureDecodeSafe mask below)
  140. FEATURE_NAME(DecodeFieldAllowed),
  141. // handling of spaces
  142. FEATURE_NAME(EncodeSpace),
  143. // in query segment: change escaped space to '+'
  144. FEATURE_NAME(EncodeSpaceAsPlus),
  145. // escape all string 'markup' symbols
  146. FEATURE_NAME(EncodeForSQL),
  147. // encoding of extended ascii symbols (8-bit)
  148. FEATURE_NAME(EncodeExtendedASCII),
  149. // decoding of extended ascii symbols (8-bit)
  150. FEATURE_NAME(DecodeExtendedASCII),
  151. // encoding of extended delimiter set
  152. FEATURE_NAME(EncodeExtendedDelim),
  153. // decoding of extended delimiter set
  154. FEATURE_NAME(DecodeExtendedDelim),
  155. // control characters [0x00 .. 0x20)
  156. FEATURE_NAME(EncodeCntrl),
  157. // raw percent character
  158. FEATURE_NAME(EncodePercent),
  159. // hash fragments
  160. // https://developers.google.com/webmasters/ajax-crawling/docs/specification
  161. // move and encode #! fragments to the query
  162. FEATURE_NAME(HashBangToEscapedFragment),
  163. // move and decode _escaped_fragment_ to the fragment
  164. FEATURE_NAME(EscapedToHashBangFragment),
  165. // reject absolute paths started by "/../"
  166. FEATURE_NAME(PathDenyRootParent),
  167. // paths started by "/../" - ignore head
  168. FEATURE_NAME(PathStripRootParent),
  169. // tries to fix errors (in particular, in fragment)
  170. FEATURE_NAME(TryToFix),
  171. // check host for DNS compliance
  172. FEATURE_NAME(CheckHost),
  173. // allow IDN hosts
  174. // host is converted to punycode and stored in FieldHostAscii
  175. // @note host contains characters in the charset of the document
  176. // and percent-encoded characters in UTF-8 (RFC 3986, 3.2.2)
  177. // @note if host contains no extended-ASCII characters and after
  178. // percent-decoding cannot be converted from UTF-8 to UCS-4,
  179. // try to recode from the document charset (if not UTF-8)
  180. FEATURE_NAME(AllowHostIDN),
  181. // forces AllowHostIDN, but host is replaced with punycode
  182. // forces CheckHost since this replacement is irreversible
  183. FEATURE_NAME(ConvertHostIDN),
  184. // robot interpreted network paths as BadFormat urls
  185. FEATURE_NAME(DenyNetworkPath),
  186. // robot interprets URLs without a host as BadFormat
  187. FEATURE_NAME(RemoteOnly),
  188. /* non-RFC use case:
  189. * 1. do not allow relative-path-only URIs when they can conflict with
  190. * "host/path" (that is, only "./path" or "../path" are allowed);
  191. * 2. if neither scheme nor userinfo are present but port is, it must
  192. * be non-empty, to avoid conflict with "scheme:/...";
  193. * 3. if AllowRootless is not specified, rootless (or opaque) URIs are
  194. * not recognized;
  195. * 4. if AllowRootless is specified, disallow userinfo, preferring
  196. * "scheme:pa@th" over "user:pass@host", and even "host:port" when
  197. * host contains only scheme-legal characters.
  198. */
  199. FEATURE_NAME(NoRelPath),
  200. // standard prefers that all hex escapes were using uppercase A-F
  201. FEATURE_NAME(UpperEncoded),
  202. // internal usage: decode all encoded symbols
  203. FEATURE_NAME(DecodeANY),
  204. // move and encode #! fragment after the query
  205. FEATURE_NAME(FragmentToHashBang),
  206. // add before this line
  207. _FeatureMAX
  208. };
  209. public:
  210. enum EPublic : ui64 {
  211. FeatureMAX = _FeatureMAX,
  212. FEATURE_FLAG(AuthSupported),
  213. FEATURE_FLAG(SchemeKnown),
  214. FEATURE_FLAG(SchemeFlexible),
  215. FEATURE_FLAG(AllowRootless),
  216. FEATURE_FLAG_NAME(AllowOpaque) = FEATURE_FLAG_NAME(AllowRootless),
  217. FEATURE_FLAG(PathOperation),
  218. FEATURE_FLAG(AllowEmptyPath),
  219. FEATURE_FLAG(ToLower),
  220. FEATURE_FLAG(DecodeUnreserved),
  221. FEATURE_FLAG(EncodeSpace),
  222. FEATURE_FLAG(EncodeSpaceAsPlus),
  223. FEATURE_FLAG(EncodeForSQL),
  224. FEATURE_FLAG(EncodeExtendedASCII),
  225. FEATURE_FLAG(DecodeExtendedASCII),
  226. FEATURE_FLAG(EncodeExtendedDelim),
  227. FEATURE_FLAG(DecodeExtendedDelim),
  228. FEATURE_FLAG(EncodeCntrl),
  229. FEATURE_FLAG(EncodePercent),
  230. FEATURE_FLAG(FragmentToHashBang),
  231. FEATURE_FLAG(HashBangToEscapedFragment),
  232. FEATURE_FLAG(EscapedToHashBangFragment),
  233. FEATURE_FLAG(PathDenyRootParent),
  234. FEATURE_FLAG(PathStripRootParent),
  235. FEATURE_FLAG(TryToFix),
  236. FEATURE_FLAG(CheckHost),
  237. FEATURE_FLAG(AllowHostIDN),
  238. FEATURE_FLAG(ConvertHostIDN),
  239. FEATURE_FLAG(DenyNetworkPath),
  240. FEATURE_FLAG(RemoteOnly),
  241. FEATURE_FLAG(NoRelPath),
  242. FEATURE_FLAG_NAME(HierURI) = FEATURE_FLAG_NAME(NoRelPath),
  243. FEATURE_FLAG(UpperEncoded),
  244. FEATURE_FLAG(DecodeANY),
  245. FEATURE_FLAG(DecodeFieldAllowed),
  246. FEATURE_FLAG(DecodeStandardExtra),
  247. };
  248. #undef FEATURE_NAME
  249. #undef FEATURE_FLAG
  250. public:
  251. //==============================
  252. enum ESets : ui64 {
  253. // these are guaranteed and will change buffer size
  254. FeatureDecodeStandard = 0 | FeatureDecodeUnreserved | FeatureDecodeStandardExtra,
  255. FeaturesDecodeExtended = 0 | FeatureDecodeExtendedASCII | FeatureDecodeExtendedDelim,
  256. FeaturesDecode = 0 | FeatureDecodeUnreserved | FeatureDecodeStandard | FeaturesDecodeExtended,
  257. FeaturesEncodeExtended = 0 | FeatureEncodeExtendedASCII | FeatureEncodeExtendedDelim,
  258. FeaturesEncode = 0 | FeatureEncodeForSQL | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent | FeaturesEncodeExtended,
  259. // these are not guaranteed to apply to a given field
  260. FeatureDecodeAllowed = 0 | FeatureDecodeUnreserved | FeatureDecodeFieldAllowed,
  261. FeaturesMaybeDecode = 0 | FeaturesDecode | FeatureDecodeAllowed,
  262. FeaturesMaybeEncode = 0 | FeaturesEncode,
  263. FeaturesEncodeDecode = 0 | FeaturesMaybeEncode | FeaturesMaybeDecode,
  264. FeaturesAllEncoder = 0 | FeaturesEncodeDecode | FeatureDecodeANY | FeatureToLower | FeatureUpperEncoded | FeatureEncodeSpaceAsPlus,
  265. //==============================
  266. FeaturesNormalizeSet = 0 | FeaturePathOperation | FeatureToLower | FeatureDecodeAllowed | FeatureEncodeSpaceAsPlus | FeatureEncodeForSQL | FeaturePathStripRootParent | FeatureTryToFix | FeatureUpperEncoded,
  267. FeaturesDefault = 0 // it reproduces old parsedURL
  268. | FeaturePathOperation | FeaturePathDenyRootParent | FeatureCheckHost,
  269. // essentially allows all valid RFC urls and keeps them as-is
  270. FeaturesBare = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureAllowEmptyPath,
  271. FeaturesAll = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureCheckHost | FeaturesNormalizeSet,
  272. // Deprecated, use FeaturesRecommended
  273. FeaturesRobotOld = 0
  274. // http://tools.ietf.org/html/rfc3986#section-6.2.2
  275. | FeatureToLower // 6.2.2.1
  276. | FeatureUpperEncoded // 6.2.2.1
  277. | FeatureDecodeUnreserved // 6.2.2.2
  278. | FeaturePathOperation // 6.2.2.3
  279. | FeaturePathDenyRootParent | FeatureSchemeKnown | FeatureConvertHostIDN | FeatureRemoteOnly | FeatureHashBangToEscapedFragment | FeatureCheckHost,
  280. // these are mutually exclusive
  281. FeaturesPath = 0 | FeaturePathDenyRootParent | FeaturePathStripRootParent,
  282. FeaturesEscapedFragment = 0 | FeatureEscapedToHashBangFragment | FeatureHashBangToEscapedFragment,
  283. FeaturesCheckSpecialChar = 0 | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent,
  284. FeaturesEncodePChar = 0 | FeatureUpperEncoded | FeaturesEncodeDecode | FeaturesCheckSpecialChar,
  285. // http://wiki.yandex-team.ru/robot/newDesign/dups/normolization
  286. // FeaturesRecommended is deprecated, use NewFeaturesRecommended: ROBOTQUALITY-718
  287. FeaturesRecommended = 0 | FeatureSchemeKnown | FeatureRemoteOnly | FeatureToLower | FeatureCheckHost | FeatureConvertHostIDN | FeatureHashBangToEscapedFragment | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodeExtendedASCII | FeatureUpperEncoded | FeatureDecodeUnreserved | FeaturePathOperation | FeaturePathStripRootParent,
  288. NewFeaturesRecommended = 0 | FeatureSchemeKnown | FeatureRemoteOnly | FeatureToLower | FeatureCheckHost | FeatureConvertHostIDN | FeatureFragmentToHashBang | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodeExtendedASCII | FeatureUpperEncoded | FeatureDecodeUnreserved | FeaturePathOperation | FeaturePathStripRootParent,
  289. // FeaturesRobot is deprecated, use NewFeaturesRecommended: ROBOTQUALITY-718
  290. FeaturesRobot = FeaturesRecommended
  291. };
  292. };
  // Case-insensitive comparison of at most len characters.
  // Two identical pointers compare equal without the global ::strnicmp
  // being called at all; everything else is delegated to it.
  static inline int strnicmp(const char* lt, const char* rt, size_t len) {
      if (lt == rt) {
          return 0;
      }
      return ::strnicmp(lt, rt, len);
  }
  296. static inline int CompareNoCasePrefix(const TStringBuf& lt, const TStringBuf& rt) {
  297. return strnicmp(lt.data(), rt.data(), rt.length());
  298. }
  299. static inline bool EqualNoCase(const TStringBuf& lt, const TStringBuf& rt) {
  300. return lt.length() == rt.length() && 0 == CompareNoCasePrefix(lt, rt);
  301. }
  302. static inline int CompareNoCase(const TStringBuf& lt, const TStringBuf& rt) {
  303. if (lt.length() == rt.length())
  304. return CompareNoCasePrefix(lt, rt);
  305. return lt.length() < rt.length() ? -1 : 1;
  306. }
  307. class TSchemeInfo {
  308. public:
  309. const TScheme::EKind Kind;
  310. const ui16 Port;
  311. const TStringBuf Str;
  312. const ui32 FldReq;
  313. TSchemeInfo(TScheme::EKind kind, TStringBuf str, ui32 fldReq = 0, ui16 port = 0)
  314. : Kind(kind)
  315. , Port(port)
  316. , Str(str)
  317. , FldReq(fldReq)
  318. {
  319. }
  320. bool Matches(const TStringBuf& scheme) const {
  321. return EqualNoCase(scheme, Str);
  322. }
  323. public:
  324. static const TSchemeInfo& Get(const TStringBuf& scheme);
  325. static const TSchemeInfo& Get(TScheme::EKind scheme) {
  326. return Registry[scheme];
  327. }
  328. static TScheme::EKind GetKind(const TStringBuf& scheme) {
  329. return Get(scheme).Kind;
  330. }
  331. static TStringBuf GetCanon(TScheme::EKind scheme) {
  332. return Get(scheme).Str;
  333. }
  334. static ui16 GetDefaultPort(TScheme::EKind scheme) {
  335. return Get(scheme).Port;
  336. }
  337. private:
  338. static const TSchemeInfo Registry[];
  339. };
  340. struct TParseFlags {
  341. const ui64 Allow;
  342. const ui64 Extra;
  343. TParseFlags(ui64 allow = 0, ui64 extra = 0)
  344. : Allow(allow)
  345. , Extra(extra)
  346. {
  347. }
  348. ui64 operator&(const TParseFlags& flags) const {
  349. return (Allow & flags.Allow) | (Extra & flags.Extra);
  350. }
  351. ui64 operator&(ui64 flags) const {
  352. return (Allow & flags);
  353. }
  354. TParseFlags operator|(const TParseFlags& flags) const {
  355. return TParseFlags(Allow | flags.Allow, Extra | flags.Extra);
  356. }
  357. TParseFlags Exclude(ui64 flags) const {
  358. return TParseFlags(Allow & ~flags, Extra & ~flags);
  359. }
  360. };
  361. #define FEATURE_NAME(f) _BitFeature##f
  362. #define FEATURE_FLAG_NAME(f) Feature##f
  363. #define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1ULL << FEATURE_NAME(f)
  364. struct TQueryArg {
  365. TStringBuf Name;
  366. TStringBuf Value;
  367. private:
  368. enum EBit {
  369. FEATURE_NAME(Filter),
  370. FEATURE_NAME(SortByName),
  371. FEATURE_NAME(RemoveEmptyQuery),
  372. FEATURE_NAME(RewriteDirty),
  373. _FeatureMAX
  374. };
  375. public:
  376. enum EPublic : ui32 {
  377. FeatureMAX = _FeatureMAX,
  378. FEATURE_FLAG(Filter),
  379. FEATURE_FLAG(SortByName),
  380. FEATURE_FLAG(RemoveEmptyQuery),
  381. FEATURE_FLAG(RewriteDirty),
  382. };
  383. enum EProcessed {
  384. // OK and clean.
  385. ProcessedOK = 0,
  386. // OK, but query stored in internal buffer and TUri::Rewrite() is required.
  387. ProcessedDirty = 1,
  388. ProcessedMalformed = 2,
  389. };
  390. };
  391. typedef bool (*TQueryArgFilter)(const TQueryArg& arg, void* filterData);
  392. #undef FEATURE_NAME
  393. #undef FEATURE_FLAG_NAME
  394. #undef FEATURE_FLAG
  395. const char* FieldToString(const TField::EField& t);
  396. const char* ParsedStateToString(const TState::EParsed& t);
  397. const char* SchemeKindToString(const TScheme::EKind& t);
  398. }
  399. Y_DECLARE_OUT_SPEC(inline, NUri::TField::EField, out, t) {
  400. out << NUri::FieldToString(t);
  401. }
  402. Y_DECLARE_OUT_SPEC(inline, NUri::TScheme::EKind, out, t) {
  403. out << NUri::SchemeKindToString(t);
  404. }
  405. Y_DECLARE_OUT_SPEC(inline, NUri::TState::EParsed, out, t) {
  406. out << NUri::ParsedStateToString(t);
  407. }
  408. static inline ui16 DefaultPort(NUri::TScheme::EKind scheme) {
  409. return NUri::TSchemeInfo::GetDefaultPort(scheme);
  410. }
  411. static inline NUri::TScheme::EKind SchemeKind(const TStringBuf& scheme) {
  412. return NUri::TSchemeInfo::GetKind(scheme);
  413. }