parsefsm.rl6 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. #include <library/cpp/uri/parse.h>
  2. #ifdef __clang__
  3. #pragma clang diagnostic ignored "-Wunused-variable"
  4. #endif
  5. %%{
  6. machine TParser;
  7. #================================================
  8. # RFC 3986 http://tools.ietf.org/html/rfc3986
  9. # with some modifications
  10. #================================================
  11. # The RegEx
  12. #
  13. # http://www.ics.uci.edu/pub/ietf/uri/#Related
  14. # ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  15. #  12            3  4          5       6  7        8 9
  16. # Matching the example URI above results in the following subexpression matches:
  17. # $1 = http:
  18. # $2 = http
  19. # $3 = //www.ics.uci.edu
  20. # $4 = www.ics.uci.edu
  21. # $5 = /pub/ietf/uri/
  22. # $6 = <undefined>
  23. # $7 = <undefined>
  24. # $8 = #Related
  25. # $9 = Related
  26. #
  27. # So $2:scheme $4:authority $5:path $7:query $9:fragment
  28. #================================================
  29. #================================================
  30. # List of all ASCII characters and where they can be used
  31. #================================================
  32. # 0-31 x00-1F cntrl ext_cntrl
  33. # 32 x20 space ext_space
  34. # 33 x21 ! sub_delims
  35. # 34 x22 " ext_delims
  36. # 35 x23 # gen_delims / f=frag
  37. # 36 x24 $ sub_delims
  38. # 37 x25 % PCT
  39. # 38 x26 & sub_delims
  40. # 39 x27 ' sub_delims
  41. # 40 x28 ( sub_delims
  42. # 41 x29 ) sub_delims
  43. # 42 x2A * sub_delims
  44. # 43 x2B + sub_delims
  45. # 44 x2C , sub_delims
  46. # 45 x2D - unreserved
  47. # 46 x2E . unreserved
  48. # 47 x2F / gen_delims / f=path,qry,frag
  49. # 48-57 x30-39 0-9 unreserved
  50. # 58 x3A : gen_delims / f=pass,path,qry,frag
  51. # 59 x3B ; sub_delims
  52. # 60 x3C < ext_delims
  53. # 61 x3D = sub_delims
  54. # 62 x3E > ext_delims
  55. # 63 x3F ? gen_delims / f=qry,frag
  56. # 64 x40 @ gen_delims / f=path,qry,frag
  57. # 65-90 x41-5A A-Z unreserved
  58. # 91 x5B [ gen_delims / ext_delims
  59. # 92 x5C \ ext_delims
  60. # 93 x5D ] gen_delims / ext_delims
  61. # 94 x5E ^ ext_delims
  62. # 95 x5F _ unreserved
  63. # 96 x60 ` ext_delims
  64. # 97-122 x61-7A a-z unreserved
  65. # 123 x7B { ext_delims
  66. # 124 x7C | ext_delims
  67. # 125 x7D } ext_delims
  68. # 126 x7E ~ unreserved
  69. # 127 x7F DEL ext_cntrl
  70. # 128-255 x80-FF ext_ascii
  71. #================================================
  72. # Actions used in multiple definitions
  73. #================================================
  # Named actions shared by several definitions. The bodies are C++ fragments:
  # REQ, CLR, BEG and END are macros that doParse() defines around the
  # generated code, and fpc is Ragel's pointer to the current character.
  # REQ expands to setRequirement(); here it is called with FeatureEncodeForSQL
  # at the current character.
  74. action act_req_enc_sql { REQ(fpc, FeatureEncodeForSQL) }
  75. # REQ must apply to a char in range but not after the range has been reset
  76. action act_req_pathop { REQ(fpc - 1, FeaturePathOperation) }
  # CLR expands to ResetSection() for the named field.
  77. action act_clr_scheme { CLR(fpc, Scheme) }
  78. action act_clr_user { CLR(fpc, User) }
  79. action act_clr_host { CLR(fpc, Host) }
  # BEG/END expand to startSection()/finishSection() for the named field.
  80. action act_beg_host { BEG(fpc, Host) }
  81. action act_end_host { END(fpc, Host) }
  82. action act_beg_path { BEG(fpc, Path) }
  83. action act_end_path { END(fpc, Path) }
  84. #================================================
  85. # RFC 3986 ABNFs
  86. #================================================
  # Basic character classes from RFC 3986, with embedded actions.
  # `>` is Ragel's entering-action operator: the action runs on the first
  # character of the sub-machine it is attached to.
  87. DIGIT = digit;
  # Upper-case letters are accepted but raise FeatureToLower at their position.
  88. ALPHA = ( upper >{ REQ(fpc, FeatureToLower) } ) |
  89. lower;
  90. ALNUM = ALPHA | DIGIT;
  # "%" calls PctBeg() with the position of the percent sign.
  91. PCT = "%" >{ PctBeg(fpc); } ;
  # Each kind of hex digit calls its own helper with the position (fpc) and
  # the character itself (fc).
  92. HEXDIG = (
  93. DIGIT >{ HexDigit(fpc, fc); }
  94. | [A-F] >{ HexUpper(fpc, fc); }
  95. | [a-f] >{ HexLower(fpc, fc); }
  96. );
  97. # HexSet sets REQ so must apply in range
  # `%` is a leaving action: it runs one position past the matched pair, hence
  # fpc - 1 to address the second hex digit.
  98. HEXNUM = ( HEXDIG HEXDIG ) %{ HexSet(fpc - 1); };
  # A complete escape: "%" followed by exactly two hex digits.
  99. pct_encoded = PCT HEXNUM;
  100. unreserved = ALNUM | "-" | "." | "_" | "~";
  101. gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";
  # The apostrophe is a sub-delim that additionally triggers act_req_enc_sql.
  102. sub_delims = "!" | "$" | "&" | "(" | ")"
  103. | "*" | "+" | "," | ";" | "="
  104. | ( ['] >act_req_enc_sql );
  105. #================================================
  106. # Local ABNFs
  107. #================================================
  # VALID: any character that is neither a control character nor whitespace,
  # plus the plain space character.
  108. VALID = ^(cntrl | space) | " ";
  109. # safe character sequences
  110. safe = unreserved | pct_encoded | sub_delims;
  111. # MOD: Yandex extensions
  # Each ext_* class accepts characters outside the strict RFC sets and raises
  # the matching Feature* requirement at the character's position.
  # ext_ascii: VALID characters that are not in Ragel's `ascii` class.
  112. ext_ascii = (VALID - ascii) >{ REQ(fpc, FeatureEncodeExtendedASCII) };
  # ext_delims: the double quote and backslash also trigger act_req_enc_sql in
  # addition to FeatureEncodeExtendedDelim.
  113. ext_delims = ( "[" | "]" | "|" | "{" | "}" | "`" | "^" | "<" | ">"
  114. | ( ["\\] >act_req_enc_sql )
  115. ) >{ REQ(fpc, FeatureEncodeExtendedDelim) }; # " fix hilite
  116. ext_space = " " >{ REQ(fpc, FeatureEncodeSpace) };
  117. ext_cntrl = cntrl >{ REQ(fpc, FeatureEncodeCntrl) };
  # "%" followed by zero, one or two hex digits, so an incomplete escape is
  # still accepted by the grammar.
  118. pct_maybe_encoded = PCT (HEXDIG | HEXNUM)? ;
  # ext_safe: the extended counterpart of `safe`, built from the classes above.
  119. ext_safe = unreserved
  120. | pct_maybe_encoded
  121. | sub_delims
  122. | ext_delims
  123. | ext_space
  124. | ext_cntrl
  125. | ext_ascii;
  126. # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
  127. # uric (RFC 2396)
  128. # MOD: extension to format, add extended delimiters and 8-bit ascii
  # pchar_nc is pchar without the colon (used by segment_nz_nc below).
  129. pchar_nc = ext_safe | "@";
  130. pchar = pchar_nc | ":";
  131. path_sep = "/";
  132. uric = pchar | path_sep | "?";
  133. #================================================
  134. # Fields
  135. #================================================
  136. # Single fields use fXXX as machine definitions
  137. #================================================
  138. # Scheme
  139. # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  140. #================================================
  # scheme: a letter followed by letters, digits, "+", "-" or "."; `**` is
  # Ragel's longest-match Kleene star.
  141. scheme = ( ALPHA ( ALPHA | DIGIT | "+" | "-" | "." )** );
  # fscheme: the scheme with BEG(Scheme) on entry and END(Scheme) on leaving.
  142. fscheme = scheme >{ BEG(fpc, Scheme) } %{ END(fpc, Scheme) };
  143. #================================================
  144. # UserInfo
  145. # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
  146. #================================================
  147. # MOD: split into a pair of sections: username and password
  # fuser: run of ext_safe characters (no ":"), recorded as the User field.
  148. fuser = ( ext_safe )** >{ BEG(fpc, User) } %{ END(fpc, User) };
  # fpass: like fuser but ":" is also allowed; recorded as the Pass field.
  149. fpass = ( ext_safe | ":" )** >{ BEG(fpc, Pass) } %{ END(fpc, Pass) };
  # userinfo here includes the terminating "@". Leaving the "@" runs
  # act_clr_host; act_clr_user is attached with `@^`, Ragel's local error
  # action embedding for non-final states.
  150. userinfo = ( fuser ( ":" fpass )? ) ( "@" %act_clr_host @^act_clr_user );
  151. #================================================
  152. # Hostname
  153. # host = IP-literal / IPv4address / reg-name
  154. #================================================
  155. # MOD: simplify IP-literal for now
  # IPv6address is deliberately loose: any non-empty run of hex digits, ":"
  # and "." is accepted, with no structural validation.
  156. IPv6address = (HEXDIG | ":" | ".")+;
  157. IP_literal = "[" IPv6address "]";
  158. # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
  159. # MOD: simplify dec-octet which originally matches only 0-255
  160. dec_octet = DIGIT+;
  161. IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;
  162. # MOD: non-empty; will use host?
  163. # reg-name = *( unreserved / pct-encoded / sub-delims )
  164. ### todo: allow ':' (need to fix grammar to disambiguate port)
  # achar: any character except 0x00-0x20, "/", "#", "?", ":" and "%".
  165. achar = any - (0x00 .. 0x20) - '/' - '#' - '?' - ':' - '%';
  # upperhalf: every character outside the 0x00-0x7F range.
  166. upperhalf = any - (0x00 .. 0x7F);
  # hostname: non-empty run of achar / pct_encoded that (via the `&`
  # intersection) contains at least one alnum or upperhalf character.
  167. hostname = (((achar | pct_encoded)+) & (any* (alnum | upperhalf) any*));
  # reg_name: a hostname that is neither an IPv4address nor an IP_literal.
  168. reg_name = hostname - IPv4address - IP_literal;
  169. # uses first-match-wins approach
  # NOTE: reg_name already excludes IPv4address, so the subtraction below
  # repeats that exclusion.
  170. host = IP_literal | IPv4address | (reg_name - IPv4address);
  # fhost allows an empty host; fhost_nempty requires one. Both record the
  # Host field through act_beg_host / act_end_host.
  171. fhost = host? >act_beg_host %act_end_host;
  172. fhost_nempty = host >act_beg_host %act_end_host;
  173. #================================================
  174. # Port
  175. # port = *DIGIT
  176. #================================================
  177. # MOD: use fport? for empty
  # fport: one or more digits, recorded as the Port field (BEG on entry, END
  # on leaving). Being DIGIT+, it never matches an empty port by itself.
  178. fport = DIGIT+ >{ BEG(fpc, Port) } %{ END(fpc, Port) };
  179. #================================================
  180. # Authority
  181. # authority = [ userinfo "@" ] host [ ":" port ]
  182. #================================================
  # authority: optional userinfo (which carries its own "@"), a possibly empty
  # host, then an optional ":" that may itself be followed by no port.
  183. authority = userinfo? fhost ( ":" fport? )? ;
  184. #================================================
  185. # Path
  186. #================================================
  187. # path = path-abempty ; begins with "/" or is empty
  188. # / path-absolute ; begins with "/" but not "//"
  189. # / path-noscheme ; begins with a non-colon segment
  190. # / path-rootless ; begins with a segment
  191. # / path-empty ; zero characters
  192. #================================================
  193. # checkPath rules
  # Helper patterns that are intersected (`&`) with the path machines below.
  # They carry the act_req_pathop embeddings (FeaturePathOperation) for paths
  # containing "." / ".." style segments or empty segments.
  # Head: text starting with ".", either alone or continuing as "./…" or "../…".
  194. checkPathHead =
  195. "." ( "."? path_sep VALID* )? %act_req_pathop ;
  # Tail: text ending in "/." or "/..".
  196. checkPathTail =
  197. VALID*
  198. ( path_sep "."{1,2} ) %act_req_pathop ;
  # Mid: text containing "//", "/./" or "/../" (zero to two dots between
  # two separators).
  199. checkPathMid = VALID*
  200. ( path_sep "."{,2} path_sep ) %act_req_pathop
  201. VALID*;
  # The plain VALID* alternative means checkAbsPath matches every string of
  # VALID characters; intersecting with it limits a path to VALID characters
  # and adds the embeddings above.
  202. checkAbsPath = checkPathMid | checkPathTail | VALID*;
  203. checkRelPath = checkPathHead | checkAbsPath;
  204. # segment = *pchar
  # Possibly empty run of pchar (longest-match Kleene star).
  205. segment = pchar**;
  206. # segment-nz = 1*pchar
  207. segment_nz = pchar+;
  208. # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
  209. segment_nz_nc = pchar_nc+;
  # A "/" followed by a (possibly empty) segment.
  210. sep_segment = path_sep segment;
  211. # non-standard definitions
  # All fpath_* machines except fpath_empty record the Path field through
  # act_beg_path (entering) and act_end_path (leaving).
  # fpath_abnempty: one or more "/segment" pieces, intersected with
  # checkAbsPath.
  212. fpath_abnempty =
  213. (
  214. ( sep_segment+ )
  215. & checkAbsPath
  216. )
  217. >act_beg_path %act_end_path
  218. ;
  # fpath_relative: ".", or "." / ".." followed by one or more "/segment"
  # pieces. act_req_pathop is attached as a leaving action, ahead of
  # act_end_path.
  219. fpath_relative =
  220. (
  221. "."
  222. ( "."? sep_segment+ )?
  223. )
  224. >act_beg_path %act_req_pathop %act_end_path
  225. ;
  226. # standard definitions
  227. # do not save empty paths, they behave differently in relative resolutions
  # zlen is Ragel's zero-length machine; no Path actions are attached.
  228. fpath_empty = zlen;
  229. fpath_abempty = fpath_abnempty?;
  # fpath_absolute: "/" optionally followed by a non-empty first segment and
  # further "/segment" pieces (so it cannot start with "//").
  230. fpath_absolute =
  231. (
  232. ( path_sep ( segment_nz sep_segment* )? )
  233. & checkAbsPath
  234. )
  235. >act_beg_path %act_end_path
  236. ;
  # fpath_noscheme: first segment is non-empty and contains no ":";
  # intersected with checkRelPath.
  237. fpath_noscheme =
  238. (
  239. ( segment_nz_nc sep_segment* )
  240. & checkRelPath
  241. )
  242. >act_beg_path %act_end_path
  243. ;
  # fpath_rootless: first segment is non-empty and may contain ":". Unlike
  # the machines above it is not intersected with any checkPath rule.
  244. fpath_rootless =
  245. (
  246. ( segment_nz sep_segment* )
  247. )
  248. >act_beg_path %act_end_path
  249. ;
  250. #================================================
  251. # Query and fragment
  252. # query = *( pchar / "/" / "?" )
  253. # fragment = *( pchar / "/" / "?" )
  254. #================================================
  255. # MOD: fragment allows '#' characters
  # fquery / ffrag: possibly empty runs recorded as the Query and Frag fields
  # (BEG on entry, END on leaving). ffrag additionally accepts "#".
  256. fquery = (uric )** >{ BEG(fpc, Query) } %{ END(fpc, Query) };
  257. ffrag = (uric | "#")** >{ BEG(fpc, Frag) } %{ END(fpc, Frag) };
  # Optional "?query" followed by optional "#fragment"; the "?" and "#"
  # introducers are outside the recorded fields.
  258. query_frag = ("?" fquery)? ("#" ffrag)? ;
  259. #================================================
  260. # final ABNFs
  261. # URI-reference = URI / relative-ref
  262. #================================================
  263. # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  264. # hier-part = "//" authority path-abempty
  265. # / path-absolute
  266. # / path-rootless
  267. # / path-empty
  268. # relative-ref = relative-part [ "?" query ] [ "#" fragment ]
  269. # relative-part = "//" authority path-abempty
  270. # / path-absolute
  271. # / path-noscheme
  272. # / path-empty
  # net_path: "//" authority followed by an absolute-or-empty path.
  273. net_path = "//" authority fpath_abempty;
  # URI: scheme ":" hier-part, then optional query/fragment.
  # `$^act_clr_scheme` attaches act_clr_scheme as a local error action to
  # the states of the parenthesised hier-part alternatives.
  274. URI =
  275. fscheme ":"
  276. (
  277. net_path
  278. | fpath_absolute
  279. | fpath_rootless
  280. | fpath_empty
  281. )
  282. $^act_clr_scheme
  283. query_frag
  284. ;
  # relative_ref: relative-part, then optional query/fragment.
  # act_clr_scheme runs as a leaving action of the relative-part
  # (presumably because a relative reference carries no scheme).
  285. relative_ref =
  286. (
  287. net_path
  288. | fpath_absolute
  289. | fpath_noscheme
  290. | fpath_empty
  291. )
  292. %act_clr_scheme
  293. query_frag
  294. ;
  295. # non-standard definitions
  # URI_no_rootless: same shape as URI but without the fpath_rootless
  # alternative.
  296. URI_no_rootless =
  297. fscheme ":"
  298. (
  299. net_path
  300. | fpath_absolute
  301. | fpath_empty
  302. )
  303. $^act_clr_scheme
  304. query_frag
  305. ;
  # host_path: a host written without the leading "//" and without userinfo.
  # Either host + path, or host ":" port + path where the host text must not
  # also match `scheme` (the `- scheme` subtraction) and the port is required.
  # act_clr_host is attached as a local error action (`@^`).
  306. host_path =
  307. (
  308. fhost_nempty fpath_abempty
  309. | (fhost_nempty - scheme) ":" fport fpath_abempty
  310. )
  311. @^act_clr_host
  312. ;
  313. # no userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
  314. relative_ref_host_pabem =
  315. (
  316. net_path
  317. | host_path
  318. | fpath_absolute
  319. | fpath_relative
  320. | fpath_empty
  321. )
  322. %act_clr_scheme
  323. query_frag
  324. ;
  325. # port must be non-empty, to avoid clash with "scheme:/..."
  # auth_path: like host_path but userinfo is allowed. Without userinfo the
  # host is required and a ":" must be followed by a port; with userinfo the
  # host and the port may both be empty.
  326. auth_path =
  327. (
  328. fhost_nempty ( ":" fport )? fpath_abempty
  329. | userinfo fhost ( ":" fport? )? fpath_abempty
  330. )
  331. @^act_clr_host
  332. @^act_clr_user
  333. ;
  334. # userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
  335. relative_ref_auth_pabem =
  336. (
  337. net_path
  338. | auth_path
  339. | fpath_absolute
  340. | fpath_relative
  341. | fpath_empty
  342. )
  343. %act_clr_scheme
  344. query_frag
  345. ;
  346. # machine instantiations
  # Each `:=` instantiation creates a named entry point; doParse() selects one
  # of them through the generated TParser_en_<name> constants.
  # URI_ref_no_rootless: a full URI without rootless paths, or a relative
  # reference that may start with a bare authority.
  347. URI_ref_no_rootless :=
  348. (
  349. URI_no_rootless
  350. # scheme://user@host preferred over user://pass@host/path
  351. | relative_ref_auth_pabem
  352. )
  353. ;
  # URI_ref_no_relpath: the host-style relative reference, or any URI that is
  # not also matched by it (the `-` subtraction).
  354. URI_ref_no_relpath :=
  355. (
  356. relative_ref_host_pabem
  357. # host:port/path preferred over scheme:path/rootless
  358. | (URI - relative_ref_host_pabem)
  359. )
  360. ;
  # URI_ref: the plain RFC 3986 URI-reference (relative-ref or URI).
  361. URI_ref :=
  362. (
  363. relative_ref
  364. | URI
  365. )
  366. ;
  # Emit the static data for the machine, including the TParser_en_* and
  # TParser_first_final constants that doParse() refers to.
  367. write data;
  368. }%%
  369. namespace NUri {
  370. bool TParser::doParse(const char* str_beg, size_t length)
  371. {
  372. const char* p = str_beg;
  373. const char* pe = str_beg + length;
  374. const char* eof = pe;
  375. int cs;
  376. #define BEG(ptr, fld) startSection (ptr, TField::Field ## fld);
  377. #define END(ptr, fld) finishSection(ptr, TField::Field ## fld);
  378. #define SET(val, fld) storeSection(val, TField::Field ## fld);
  379. #define CLR(ptr, fld) ResetSection (TField::Field ## fld, ptr);
  380. #define REQ(ptr, req) setRequirement(ptr, TFeature :: req);
  381. %% write init nocs;
  382. if (0 == (Flags & TFeature::FeatureNoRelPath)) {
  383. cs = TParser_en_URI_ref;
  384. } else if (0 == (Flags & TFeature::FeatureAllowRootless)) {
  385. cs = TParser_en_URI_ref_no_rootless;
  386. } else {
  387. cs = TParser_en_URI_ref_no_relpath;
  388. }
  389. %% write exec;
  390. #undef BEG
  391. #undef END
  392. #undef SET
  393. #undef CLR
  394. #undef REQ
  395. return cs >= TParser_first_final;
  396. }
  397. }