common.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. #pragma once
  2. #include <util/stream/output.h>
  3. #include <util/system/compat.h>
  4. #include <util/generic/strbuf.h>
  5. namespace NUri {
  // Forward declarations of the types that TFeature names as friends below.
  namespace NEncode {
      class TEncoder;
      class TEncodeMapperBase;
      struct TCharFlags;
  }

  namespace NParse {
      class TRange;
  }

  class TParser;
  // Names the components of a URI (EField) and the bit masks that select
  // one or several of them (EFlags).
  struct TField {
      // Index of a URI component. Values are consecutive starting from 0,
      // and each index doubles as the bit position of the matching flag.
      enum EField {
          FieldScheme,
          FieldUser,
          FieldPass,
          FieldHost,
          FieldPort,
          FieldPath,
          FieldQuery,
          FieldFrag,
          // add fields above

          // number of fields that belong to the URL itself
          FieldUrlMAX,
          // step back by one so the extra fields continue the numbering
          // without a gap
          FieldUrlLast = FieldUrlMAX - 1,

          // add extra fields below
          FieldHostAscii,
          // add extra fields above

          // total number of fields, extra ones included
          FieldAllMAX,

          // add aliases below
          FieldUsername = FieldUser,
          FieldPassword = FieldPass,
          FieldFragment = FieldFrag,
      };

      // Bit masks over EField: every single-field flag is 1U shifted by the
      // field index; the remaining enumerators are combinations of them.
      enum EFlags {
          FlagScheme = 1U << FieldScheme,
          FlagUser = 1U << FieldUser,
          FlagPass = 1U << FieldPass,
          FlagHost = 1U << FieldHost,
          FlagPort = 1U << FieldPort,
          FlagPath = 1U << FieldPath,
          FlagQuery = 1U << FieldQuery,
          FlagFrag = 1U << FieldFrag,
          // first bit above the URL fields; same value as FlagHostAscii
          FlagUrlMAX = 1U << FieldUrlMAX,
          FlagHostAscii = 1U << FieldHostAscii,
          // first bit above all fields
          FlagAllMAX = 1U << FieldAllMAX,

          // combinations
          FlagHostPort = FlagHost | FlagPort,
          FlagAuth = FlagUser | FlagPass,
          FlagFragment = FlagFrag,
          FlagAction = FlagScheme | FlagHostPort | FlagPath,
          FlagNoFrag = FlagAction | FlagQuery,
          // every URL field (all bits below FlagUrlMAX)
          FlagUrlFields = FlagUrlMAX - 1,
          FlagAll = FlagUrlFields, // obsolete, for backwards compatibility
          // every field, extra ones included (all bits below FlagAllMAX)
          FlagAllFields = FlagAllMAX - 1
      };
  };
  // Outcome codes of URI parsing.
  struct TState {
      enum EParsed {
          // non-error states
          ParsedOK = 0,
          ParsedEmpty,
          ParsedOpaque,

          // error states; ParsedBadFormat must follow all non-error states
          // immediately
          ParsedBadFormat,
          ParsedBadPath,
          ParsedTooLong,
          ParsedBadPort,
          ParsedBadAuth,
          ParsedBadScheme,
          ParsedBadHost,
          // add before this line
          ParsedMAX,

          // aliases
          ParsedRootless = ParsedOpaque
      };
  };
  // Kinds of URI schemes known to the library.
  struct TScheme {
      // Don't forget to define a SchemeRegistry entry for every new kind.
      enum EKind {
          SchemeEmpty,
          // add schemes below this line
          SchemeHTTP,
          SchemeHTTPS,
          SchemeFTP,
          SchemeFILE,
          SchemeWS,
          SchemeWSS,
          // add schemes above this line
          SchemeUnknown
      };
  };
  98. class TFeature {
  99. friend class NEncode::TEncoder;
  100. friend class NEncode::TEncodeMapperBase;
  101. friend struct NEncode::TCharFlags;
  102. friend class TParser;
  103. friend class NParse::TRange;
  104. #define FEATURE_NAME(f) _BitFeature##f
  105. #define FEATURE_FLAG_NAME(f) Feature##f
  106. #define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f)
  107. protected:
  108. enum EBit {
  109. //==============================
  110. // Cases interpreted as errors:
  111. //==============================
  112. // allows authorization user/password in URL
  113. FEATURE_NAME(AuthSupported),
  114. // allows all known schemes in URL
  115. FEATURE_NAME(SchemeKnown),
  116. // allows all schemes, not only known
  117. FEATURE_NAME(SchemeFlexible),
  118. // allow opaque (RFC 2396) or rootless (RFC 3986) urls
  119. FEATURE_NAME(AllowRootless),
  120. //==============================
  121. // Cases interpreted for processing (if required):
  122. // (effects on result of Parse method)
  123. //==============================
  124. // path needs normalization
  125. // (simplification of directory tree: /../, /./, etc.
  126. FEATURE_NAME(PathOperation),
  127. // don't force empty path to "/"
  128. FEATURE_NAME(AllowEmptyPath),
  129. // in scheme and host segments:
  130. // change upper case letters onto lower case ones
  131. FEATURE_NAME(ToLower),
  132. // decode unreserved symbols
  133. FEATURE_NAME(DecodeUnreserved),
  134. // legacy: decode standard symbols which may be safe for some fields
  135. FEATURE_NAME(DecodeStandardExtra),
  136. // decode symbols allowed (not necessarily safe to decode) only for a given field
  137. // (do not use directly, instead use FeatureDecodeSafe mask below)
  138. FEATURE_NAME(DecodeFieldAllowed),
  139. // handling of spaces
  140. FEATURE_NAME(EncodeSpace),
  141. // in query segment: change escaped space to '+'
  142. FEATURE_NAME(EncodeSpaceAsPlus),
  143. // escape all string 'markup' symbols
  144. FEATURE_NAME(EncodeForSQL),
  145. // encoding of extended ascii symbols (8-bit)
  146. FEATURE_NAME(EncodeExtendedASCII),
  147. // decoding of extended ascii symbols (8-bit)
  148. FEATURE_NAME(DecodeExtendedASCII),
  149. // encoding of extended delimiter set
  150. FEATURE_NAME(EncodeExtendedDelim),
  151. // decoding of extended delimiter set
  152. FEATURE_NAME(DecodeExtendedDelim),
  153. // control characters [0x00 .. 0x20)
  154. FEATURE_NAME(EncodeCntrl),
  155. // raw percent character
  156. FEATURE_NAME(EncodePercent),
  157. // hash fragments
  158. // https://developers.google.com/webmasters/ajax-crawling/docs/specification
  159. // move and encode #! fragments to the query
  160. FEATURE_NAME(HashBangToEscapedFragment),
  161. // move and decode _escaped_fragment_ to the fragment
  162. FEATURE_NAME(EscapedToHashBangFragment),
  163. // reject absolute paths started by "/../"
  164. FEATURE_NAME(PathDenyRootParent),
  165. // paths started by "/../" - ignore head
  166. FEATURE_NAME(PathStripRootParent),
  167. // tries to fix errors (in particular, in fragment)
  168. FEATURE_NAME(TryToFix),
  169. // check host for DNS compliance
  170. FEATURE_NAME(CheckHost),
  171. // allow IDN hosts
  172. // host is converted to punycode and stored in FieldHostAscii
  173. // @note host contains characters in the charset of the document
  174. // and percent-encoded characters in UTF-8 (RFC 3986, 3.2.2)
  175. // @note if host contains no extended-ASCII characters and after
  176. // percent-decoding cannot be converted from UTF-8 to UCS-4,
  177. // try to recode from the document charset (if not UTF-8)
  178. FEATURE_NAME(AllowHostIDN),
  179. // forces AllowHostIDN, but host is replaced with punycode
  180. // forces CheckHost since this replacement is irreversible
  181. FEATURE_NAME(ConvertHostIDN),
  182. // robot interpreted network paths as BadFormat urls
  183. FEATURE_NAME(DenyNetworkPath),
  184. // robot interprets URLs without a host as BadFormat
  185. FEATURE_NAME(RemoteOnly),
  186. /* non-RFC use case:
  187. * 1. do not allow relative-path-only URIs when they can conflict with
  188. * "host/path" (that is, only "./path" or "../path" are allowed);
  189. * 2. if neither scheme nor userinfo are present but port is, it must
  190. * be non-empty, to avoid conflict with "scheme:/...";
  191. * 3. if AllowRootless is not specified, rootless (or opaque) URIs are
  192. * not recognized;
  193. * 4. if AllowRootless is specified, disallow userinfo, preferring
  194. * "scheme:pa@th" over "user:pass@host", and even "host:port" when
  195. * host contains only scheme-legal characters.
  196. */
  197. FEATURE_NAME(NoRelPath),
  198. // standard prefers that all hex escapes were using uppercase A-F
  199. FEATURE_NAME(UpperEncoded),
  200. // internal usage: decode all encoded symbols
  201. FEATURE_NAME(DecodeANY),
  202. // add before this line
  203. _FeatureMAX
  204. };
  205. protected:
  206. enum EPrivate : ui32 {
  207. FEATURE_FLAG(DecodeANY),
  208. FEATURE_FLAG(DecodeFieldAllowed),
  209. FEATURE_FLAG(DecodeStandardExtra),
  210. };
  211. public:
  212. enum EPublic : ui32 {
  213. FeatureMAX = _FeatureMAX,
  214. FEATURE_FLAG(AuthSupported),
  215. FEATURE_FLAG(SchemeKnown),
  216. FEATURE_FLAG(SchemeFlexible),
  217. FEATURE_FLAG(AllowRootless),
  218. FEATURE_FLAG_NAME(AllowOpaque) = FEATURE_FLAG_NAME(AllowRootless),
  219. FEATURE_FLAG(PathOperation),
  220. FEATURE_FLAG(AllowEmptyPath),
  221. FEATURE_FLAG(ToLower),
  222. FEATURE_FLAG(DecodeUnreserved),
  223. FEATURE_FLAG(EncodeSpace),
  224. FEATURE_FLAG(EncodeSpaceAsPlus),
  225. FEATURE_FLAG(EncodeForSQL),
  226. FEATURE_FLAG(EncodeExtendedASCII),
  227. FEATURE_FLAG(DecodeExtendedASCII),
  228. FEATURE_FLAG(EncodeExtendedDelim),
  229. FEATURE_FLAG(DecodeExtendedDelim),
  230. FEATURE_FLAG(EncodeCntrl),
  231. FEATURE_FLAG(EncodePercent),
  232. FEATURE_FLAG(HashBangToEscapedFragment),
  233. FEATURE_FLAG(EscapedToHashBangFragment),
  234. FEATURE_FLAG(PathDenyRootParent),
  235. FEATURE_FLAG(PathStripRootParent),
  236. FEATURE_FLAG(TryToFix),
  237. FEATURE_FLAG(CheckHost),
  238. FEATURE_FLAG(AllowHostIDN),
  239. FEATURE_FLAG(ConvertHostIDN),
  240. FEATURE_FLAG(DenyNetworkPath),
  241. FEATURE_FLAG(RemoteOnly),
  242. FEATURE_FLAG(NoRelPath),
  243. FEATURE_FLAG_NAME(HierURI) = FEATURE_FLAG_NAME(NoRelPath),
  244. FEATURE_FLAG(UpperEncoded),
  245. };
  246. #undef FEATURE_NAME
  247. #undef FEATURE_FLAG
  248. public:
  249. //==============================
  250. enum ESets {
  251. // these are guaranteed and will change buffer size
  252. FeatureDecodeStandard = 0 | FeatureDecodeUnreserved | FeatureDecodeStandardExtra,
  253. FeaturesDecodeExtended = 0 | FeatureDecodeExtendedASCII | FeatureDecodeExtendedDelim,
  254. FeaturesDecode = 0 | FeatureDecodeUnreserved | FeatureDecodeStandard | FeaturesDecodeExtended,
  255. FeaturesEncodeExtended = 0 | FeatureEncodeExtendedASCII | FeatureEncodeExtendedDelim,
  256. FeaturesEncode = 0 | FeatureEncodeForSQL | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent | FeaturesEncodeExtended,
  257. // these are not guaranteed to apply to a given field
  258. FeatureDecodeAllowed = 0 | FeatureDecodeUnreserved | FeatureDecodeFieldAllowed,
  259. FeaturesMaybeDecode = 0 | FeaturesDecode | FeatureDecodeAllowed,
  260. FeaturesMaybeEncode = 0 | FeaturesEncode,
  261. FeaturesEncodeDecode = 0 | FeaturesMaybeEncode | FeaturesMaybeDecode,
  262. FeaturesAllEncoder = 0 | FeaturesEncodeDecode | FeatureDecodeANY | FeatureToLower | FeatureUpperEncoded | FeatureEncodeSpaceAsPlus,
  263. //==============================
  264. FeaturesNormalizeSet = 0 | FeaturePathOperation | FeatureToLower | FeatureDecodeAllowed | FeatureEncodeSpaceAsPlus | FeatureEncodeForSQL | FeaturePathStripRootParent | FeatureTryToFix | FeatureUpperEncoded,
  265. FeaturesDefault = 0 // it reproduces old parsedURL
  266. | FeaturePathOperation | FeaturePathDenyRootParent | FeatureCheckHost,
  267. // essentially allows all valid RFC urls and keeps them as-is
  268. FeaturesBare = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureAllowEmptyPath,
  269. FeaturesAll = 0 | FeatureAuthSupported | FeatureSchemeFlexible | FeatureCheckHost | FeaturesNormalizeSet,
  270. // Deprecated, use FeaturesRecommended
  271. FeaturesRobotOld = 0
  272. // http://tools.ietf.org/html/rfc3986#section-6.2.2
  273. | FeatureToLower // 6.2.2.1
  274. | FeatureUpperEncoded // 6.2.2.1
  275. | FeatureDecodeUnreserved // 6.2.2.2
  276. | FeaturePathOperation // 6.2.2.3
  277. | FeaturePathDenyRootParent | FeatureSchemeKnown | FeatureConvertHostIDN | FeatureRemoteOnly | FeatureHashBangToEscapedFragment | FeatureCheckHost,
  278. // these are mutually exclusive
  279. FeaturesPath = 0 | FeaturePathDenyRootParent | FeaturePathStripRootParent,
  280. FeaturesEscapedFragment = 0 | FeatureEscapedToHashBangFragment | FeatureHashBangToEscapedFragment,
  281. FeaturesCheckSpecialChar = 0 | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodePercent,
  282. FeaturesEncodePChar = 0 | FeatureUpperEncoded | FeaturesEncodeDecode | FeaturesCheckSpecialChar,
  283. // http://wiki.yandex-team.ru/robot/newDesign/dups/normolization
  284. FeaturesRecommended = 0 | FeatureSchemeKnown | FeatureRemoteOnly | FeatureToLower | FeatureCheckHost | FeatureConvertHostIDN | FeatureHashBangToEscapedFragment | FeatureEncodeSpace | FeatureEncodeCntrl | FeatureEncodeExtendedASCII | FeatureUpperEncoded | FeatureDecodeUnreserved | FeaturePathOperation | FeaturePathStripRootParent,
  285. FeaturesRobot = FeaturesRecommended
  286. };
  287. };
  // Wrapper over the global ::strnicmp that skips the call altogether when
  // both arguments are the very same pointer (the result is 0 in that case).
  static inline int strnicmp(const char* lt, const char* rt, size_t len) {
      if (lt == rt) {
          return 0;
      }
      return ::strnicmp(lt, rt, len);
  }
  291. static inline int CompareNoCasePrefix(const TStringBuf& lt, const TStringBuf& rt) {
  292. return strnicmp(lt.data(), rt.data(), rt.length());
  293. }
  294. static inline bool EqualNoCase(const TStringBuf& lt, const TStringBuf& rt) {
  295. return lt.length() == rt.length() && 0 == CompareNoCasePrefix(lt, rt);
  296. }
  297. static inline int CompareNoCase(const TStringBuf& lt, const TStringBuf& rt) {
  298. if (lt.length() == rt.length())
  299. return CompareNoCasePrefix(lt, rt);
  300. return lt.length() < rt.length() ? -1 : 1;
  301. }
  302. class TSchemeInfo {
  303. public:
  304. const TScheme::EKind Kind;
  305. const ui16 Port;
  306. const TStringBuf Str;
  307. const ui32 FldReq;
  308. TSchemeInfo(TScheme::EKind kind, TStringBuf str, ui32 fldReq = 0, ui16 port = 0)
  309. : Kind(kind)
  310. , Port(port)
  311. , Str(str)
  312. , FldReq(fldReq)
  313. {
  314. }
  315. bool Matches(const TStringBuf& scheme) const {
  316. return EqualNoCase(scheme, Str);
  317. }
  318. public:
  319. static const TSchemeInfo& Get(const TStringBuf& scheme);
  320. static const TSchemeInfo& Get(TScheme::EKind scheme) {
  321. return Registry[scheme];
  322. }
  323. static TScheme::EKind GetKind(const TStringBuf& scheme) {
  324. return Get(scheme).Kind;
  325. }
  326. static TStringBuf GetCanon(TScheme::EKind scheme) {
  327. return Get(scheme).Str;
  328. }
  329. static ui16 GetDefaultPort(TScheme::EKind scheme) {
  330. return Get(scheme).Port;
  331. }
  332. private:
  333. static const TSchemeInfo Registry[];
  334. };
  335. struct TParseFlags {
  336. const ui64 Allow;
  337. const ui64 Extra;
  338. TParseFlags(ui64 allow = 0, ui64 extra = 0)
  339. : Allow(allow)
  340. , Extra(extra)
  341. {
  342. }
  343. ui64 operator&(const TParseFlags& flags) const {
  344. return (Allow & flags.Allow) | (Extra & flags.Extra);
  345. }
  346. ui64 operator&(ui64 flags) const {
  347. return (Allow & flags);
  348. }
  349. TParseFlags operator|(const TParseFlags& flags) const {
  350. return TParseFlags(Allow | flags.Allow, Extra | flags.Extra);
  351. }
  352. TParseFlags Exclude(ui64 flags) const {
  353. return TParseFlags(Allow & ~flags, Extra & ~flags);
  354. }
  355. };
  356. #define FEATURE_NAME(f) _BitFeature##f
  357. #define FEATURE_FLAG_NAME(f) Feature##f
  358. #define FEATURE_FLAG(f) FEATURE_FLAG_NAME(f) = 1UL << FEATURE_NAME(f)
  359. struct TQueryArg {
  360. TStringBuf Name;
  361. TStringBuf Value;
  362. private:
  363. enum EBit {
  364. FEATURE_NAME(Filter),
  365. FEATURE_NAME(SortByName),
  366. FEATURE_NAME(RemoveEmptyQuery),
  367. FEATURE_NAME(RewriteDirty),
  368. _FeatureMAX
  369. };
  370. public:
  371. enum EPublic : ui32 {
  372. FeatureMAX = _FeatureMAX,
  373. FEATURE_FLAG(Filter),
  374. FEATURE_FLAG(SortByName),
  375. FEATURE_FLAG(RemoveEmptyQuery),
  376. FEATURE_FLAG(RewriteDirty),
  377. };
  378. enum EProcessed {
  379. // OK and clean.
  380. ProcessedOK = 0,
  381. // OK, but query stored in internal buffer and TUri::Rewrite() is required.
  382. ProcessedDirty = 1,
  383. ProcessedMalformed = 2,
  384. ProcessedTooMany = 3,
  385. };
  386. };
  387. typedef bool (*TQueryArgFilter)(const TQueryArg& arg, void* filterData);
  388. #undef FEATURE_NAME
  389. #undef FEATURE_FLAG_NAME
  390. #undef FEATURE_FLAG
  391. const char* FieldToString(const TField::EField& t);
  392. const char* ParsedStateToString(const TState::EParsed& t);
  393. const char* SchemeKindToString(const TScheme::EKind& t);
  394. }
  395. Y_DECLARE_OUT_SPEC(inline, NUri::TField::EField, out, t) {
  396. out << NUri::FieldToString(t);
  397. }
  398. Y_DECLARE_OUT_SPEC(inline, NUri::TScheme::EKind, out, t) {
  399. out << NUri::SchemeKindToString(t);
  400. }
  401. Y_DECLARE_OUT_SPEC(inline, NUri::TState::EParsed, out, t) {
  402. out << NUri::ParsedStateToString(t);
  403. }
  404. static inline ui16 DefaultPort(NUri::TScheme::EKind scheme) {
  405. return NUri::TSchemeInfo::GetDefaultPort(scheme);
  406. }
  407. static inline NUri::TScheme::EKind SchemeKind(const TStringBuf& scheme) {
  408. return NUri::TSchemeInfo::GetKind(scheme);
  409. }