// NOTE(review): the operand of the #include below is missing in the reviewed
// text (an angle-bracketed header name appears to have been stripped).
// Restore the header that declares NUri::TParser before building.
#include

#ifdef __clang__
    #pragma clang diagnostic ignored "-Wunused-variable"
#endif

%%{

    machine TParser;

    #================================================
    # RFC 3986 http://tools.ietf.org/html/rfc3986
    # with some modifications
    #================================================
    # The RegEx
    #
    # http://www.ics.uci.edu/pub/ietf/uri/#Related
    #   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
    #    12            3  4          5       6  7        8 9
    # results in the following subexpression matches:
    #   $1 = http:
    #   $2 = http
    #   $3 = //www.ics.uci.edu
    #   $4 = www.ics.uci.edu
    #   $5 = /pub/ietf/uri/
    #   $6 = <undefined>
    #   $7 = <undefined>
    #   $8 = #Related
    #   $9 = Related
    #
    # So $2:scheme $4:authority $5:path $7:query $9:fragment
    #================================================

    #================================================
    # List of all ASCII characters and where they can be used
    #================================================
    # 0-31    x00-1F cntrl ext_cntrl
    # 32      x20    space ext_space
    # 33      x21    !     sub_delims
    # 34      x22    "     ext_delims
    # 35      x23    #     gen_delims / f=frag
    # 36      x24    $     sub_delims
    # 37      x25    %     PCT
    # 38      x26    &     sub_delims
    # 39      x27    '     sub_delims
    # 40      x28    (     sub_delims
    # 41      x29    )     sub_delims
    # 42      x2A    *     sub_delims
    # 43      x2B    +     sub_delims
    # 44      x2C    ,     sub_delims
    # 45      x2D    -     unreserved
    # 46      x2E    .     unreserved
    # 47      x2F    /     gen_delims / f=path,qry,frag
    # 48-57   x30-39 0-9   unreserved
    # 58      x3A    :     gen_delims / f=pass,path,qry,frag
    # 59      x3B    ;     sub_delims
    # 60      x3C    <     ext_delims
    # 61      x3D    =     sub_delims
    # 62      x3E    >     ext_delims
    # 63      x3F    ?     gen_delims / f=qry,frag
    # 64      x40    @     gen_delims / f=path,qry,frag
    # 65-90   x41-5A A-Z   unreserved
    # 91      x5B    [     gen_delims / ext_delims
    # 92      x5C    \     ext_delims
    # 93      x5D    ]     gen_delims / ext_delims
    # 94      x5E    ^     ext_delims
    # 95      x5F    _     unreserved
    # 96      x60    `     ext_delims
    # 97-122  x61-7A a-z   unreserved
    # 123     x7B    {     ext_delims
    # 124     x7C    |     ext_delims
    # 125     x7D    }     ext_delims
    # 126     x7E    ~     unreserved
    # 127     x7F    DEL   ext_cntrl
    # 128-255 x80-FF       ext_ascii

    #================================================
    # Actions used in multiple definitions
    #================================================
    # NOTE: the REQ/CLR/BEG/END macros carry their own trailing ';'
    # (see the #define lines in doParse), so actions call them without one.

    action act_req_enc_sql { REQ(fpc, FeatureEncodeForSQL) }

    # REQ must apply to a char in range but not after the range has been reset
    action act_req_pathop { REQ(fpc - 1, FeaturePathOperation) }

    action act_clr_scheme { CLR(fpc, Scheme) }
    action act_clr_user   { CLR(fpc, User) }
    action act_clr_host   { CLR(fpc, Host) }
    action act_beg_host   { BEG(fpc, Host) }
    action act_end_host   { END(fpc, Host) }
    action act_beg_path   { BEG(fpc, Path) }
    action act_end_path   { END(fpc, Path) }

    #================================================
    # RFC 3986 ABNFs
    #================================================

    DIGIT = digit;
    ALPHA = ( upper >{ REQ(fpc, FeatureToLower) } ) | lower;
    ALNUM = ALPHA | DIGIT;

    PCT = "%" >{ PctBeg(fpc); } ;
    HEXDIG = (
          DIGIT >{ HexDigit(fpc, fc); }
        | [A-F] >{ HexUpper(fpc, fc); }
        | [a-f] >{ HexLower(fpc, fc); }
    );
    # HexSet sets REQ so must apply in range
    HEXNUM = ( HEXDIG HEXDIG ) %{ HexSet(fpc - 1); };
    pct_encoded = PCT HEXNUM;

    unreserved = ALNUM | "-" | "." | "_" | "~";

    gen_delims = ":" | "/" | "?" | "#" | "[" | "]" | "@";
    sub_delims = "!" | "$" | "&" | "(" | ")"
               | "*" | "+" | "," | ";" | "="
               | ( ['] >act_req_enc_sql );

    #================================================
    # Local ABNFs
    #================================================

    VALID = ^(cntrl | space) | " ";

    # safe character sequences
    safe = unreserved | pct_encoded | sub_delims;

    # MOD: Yandex extensions

    ext_ascii = (VALID - ascii) >{ REQ(fpc, FeatureEncodeExtendedASCII) };
    ext_delims = ( "[" | "]" | "|" | "{" | "}" | "`" | "^" | "<" | ">"
                 | ( ["\\] >act_req_enc_sql )
                 ) >{ REQ(fpc, FeatureEncodeExtendedDelim) }; # " fix hilite
    ext_space = " " >{ REQ(fpc, FeatureEncodeSpace) };
    ext_cntrl = cntrl >{ REQ(fpc, FeatureEncodeCntrl) };

    # a '%' followed by zero, one or two hex digits
    pct_maybe_encoded = PCT (HEXDIG | HEXNUM)? ;
    ext_safe = unreserved
             | pct_maybe_encoded
             | sub_delims
             | ext_delims
             | ext_space
             | ext_cntrl
             | ext_ascii;

    # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
    # uric (RFC 2396)
    # MOD: extension to format, add extended delimiters and 8-bit ascii

    pchar_nc = ext_safe | "@";
    pchar = pchar_nc | ":";

    path_sep = "/";

    uric = pchar | path_sep | "?";

    #================================================
    # Fields
    #================================================
    # Single fields use fXXX as machine definitions

    #================================================
    # Scheme
    # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    #================================================

    scheme = ( ALPHA ( ALPHA | DIGIT | "+" | "-" | "." )** );
    fscheme = scheme >{ BEG(fpc, Scheme) } %{ END(fpc, Scheme) };

    #================================================
    # UserInfo
    # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
    #================================================

    # MOD: split into a pair of sections: username and password

    fuser = ( ext_safe )** >{ BEG(fpc, User) } %{ END(fpc, User) };
    fpass = ( ext_safe | ":" )** >{ BEG(fpc, Pass) } %{ END(fpc, Pass) };
    userinfo = ( fuser ( ":" fpass )? ) ( "@" %act_clr_host @^act_clr_user );

    #================================================
    # Hostname
    # host = IP-literal / IPv4address / reg-name
    #================================================

    # MOD: simplify IP-literal for now
    IPv6address = (HEXDIG | ":" | ".")+;
    IP_literal = "[" IPv6address "]";

    # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
    # MOD: simplify dec-octet which originally matches only 0-255

    dec_octet = DIGIT+;
    IPv4address = dec_octet "." dec_octet "." dec_octet "." dec_octet;

    # MOD: non-empty; will use host?
    # reg-name = *( unreserved / pct-encoded / sub-delims )
    ### todo: allow ':' (need to fix grammar to disambiguate port)
    achar = any - (0x00 .. 0x20) - '/' - '#' - '?' - ':' - '%';
    upperhalf = any - (0x00 .. 0x7F);
    # non-empty run of achar / pct_encoded containing at least one
    # alphanumeric or 8-bit character
    hostname = (((achar | pct_encoded)+) & (any* (alnum | upperhalf) any*));
    reg_name = hostname - IPv4address - IP_literal;

    # uses first-match-wins approach
    host = IP_literal | IPv4address | (reg_name - IPv4address);
    fhost = host? >act_beg_host %act_end_host;
    fhost_nempty = host >act_beg_host %act_end_host;

    #================================================
    # Port
    # port = *DIGIT
    #================================================

    # MOD: use fport? for empty
    fport = DIGIT+ >{ BEG(fpc, Port) } %{ END(fpc, Port) };

    #================================================
    # Authority
    # authority = [ userinfo "@" ] host [ ":" port ]
    #================================================

    authority = userinfo? fhost ( ":" fport? )? ;

    #================================================
    # Path
    #================================================
    # path = path-abempty    ; begins with "/" or is empty
    #      / path-absolute   ; begins with "/" but not "//"
    #      / path-noscheme   ; begins with a non-colon segment
    #      / path-rootless   ; begins with a segment
    #      / path-empty      ; zero characters
    #================================================

    # checkPath rules

    checkPathHead = "." ( "."? path_sep VALID* )? %act_req_pathop ;

    checkPathTail = VALID* ( path_sep "."{1,2} ) %act_req_pathop ;

    checkPathMid = VALID* ( path_sep "."{,2} path_sep ) %act_req_pathop VALID*;

    checkAbsPath = checkPathMid | checkPathTail | VALID*;
    checkRelPath = checkPathHead | checkAbsPath;

    # segment = *pchar
    segment = pchar**;

    # segment-nz = 1*pchar
    segment_nz = pchar+;

    # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
    segment_nz_nc = pchar_nc+;

    sep_segment = path_sep segment;

    # non-standard definitions

    fpath_abnempty = (
        ( sep_segment+ )
        & checkAbsPath
    ) >act_beg_path %act_end_path ;

    fpath_relative = (
        "." ( "."? sep_segment+ )?
    ) >act_beg_path %act_req_pathop %act_end_path ;

    # standard definitions

    # do not save empty paths, they behave differently in relative resolutions
    fpath_empty = zlen;

    fpath_abempty = fpath_abnempty?;

    fpath_absolute = (
        ( path_sep ( segment_nz sep_segment* )? )
        & checkAbsPath
    ) >act_beg_path %act_end_path ;

    fpath_noscheme = (
        ( segment_nz_nc sep_segment* )
        & checkRelPath
    ) >act_beg_path %act_end_path ;

    fpath_rootless = (
        ( segment_nz sep_segment* )
    ) >act_beg_path %act_end_path ;

    #================================================
    # Query and fragment
    # query = *( pchar / "/" / "?" )
    # fragment = *( pchar / "/" / "?" )
    #================================================

    # MOD: fragment allows '#' characters

    fquery = (uric )** >{ BEG(fpc, Query) } %{ END(fpc, Query) };
    ffrag = (uric | "#")** >{ BEG(fpc, Frag) } %{ END(fpc, Frag) };
    query_frag = ("?" fquery)? ("#" ffrag)? ;

    #================================================
    # final ABNFs
    # URI-reference = URI / relative-ref
    #================================================
    # URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
    # hier-part     = "//" authority path-abempty
    #               / path-absolute
    #               / path-rootless
    #               / path-empty
    # relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
    # relative-part = "//" authority path-abempty
    #               / path-absolute
    #               / path-noscheme
    #               / path-empty

    net_path = "//" authority fpath_abempty;

    URI = fscheme ":"
        (
              net_path
            | fpath_absolute
            | fpath_rootless
            | fpath_empty
        ) $^act_clr_scheme
        query_frag ;

    relative_ref =
        (
              net_path
            | fpath_absolute
            | fpath_noscheme
            | fpath_empty
        ) %act_clr_scheme
        query_frag ;

    # non-standard definitions

    URI_no_rootless = fscheme ":"
        (
              net_path
            | fpath_absolute
            | fpath_empty
        ) $^act_clr_scheme
        query_frag ;

    host_path =
        (
              fhost_nempty fpath_abempty
            | (fhost_nempty - scheme) ":" fport fpath_abempty
        ) @^act_clr_host ;

    # no userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
    relative_ref_host_pabem =
        (
              net_path
            | host_path
            | fpath_absolute
            | fpath_relative
            | fpath_empty
        ) %act_clr_scheme
        query_frag ;

    # port must be non-empty, to avoid clash with "scheme:/..."
    auth_path =
        (
              fhost_nempty ( ":" fport )? fpath_abempty
            | userinfo fhost ( ":" fport? )? fpath_abempty
        ) @^act_clr_host @^act_clr_user ;

    # userinfo, path absolute, empty or clearly relative, starting with "./" | "../"
    relative_ref_auth_pabem =
        (
              net_path
            | auth_path
            | fpath_absolute
            | fpath_relative
            | fpath_empty
        ) %act_clr_scheme
        query_frag ;

    # machine instantiations
    # (each ":=" below creates an entry point that doParse selects via
    #  the generated TParser_en_* constants)

    URI_ref_no_rootless :=
        (
            URI_no_rootless
            # scheme://user@host preferred over user://pass@host/path
            | relative_ref_auth_pabem
        ) ;

    URI_ref_no_relpath :=
        (
            relative_ref_host_pabem
            # host:port/path preferred over scheme:path/rootless
            | (URI - relative_ref_host_pabem)
        ) ;

    URI_ref :=
        (
              relative_ref
            | URI
        ) ;

    write data;

}%%

namespace NUri {

/// Runs the generated TParser state machine over the buffer
/// [str_beg, str_beg + length).
///
/// The grammar entry point is chosen from Flags (see below). The embedded
/// grammar actions report field boundaries and feature requirements through
/// the BEG/END/CLR/REQ macros defined locally in this function.
///
/// @param str_beg  start of the text to parse
/// @param length   number of characters to parse
/// @return true if the machine finishes in a final (accepting) state
bool TParser::doParse(const char* str_beg, size_t length)
{
    // Cursor variables referenced by the Ragel-generated code:
    // p = current position, pe = end of data, eof = end of input.
    const char* p = str_beg;
    const char* pe = str_beg + length;
    const char* eof = pe;
    int cs; // current machine state; assigned below, before "write exec"

// Helpers used by the grammar actions. Each expansion ends with ';' on
// purpose: the actions invoke these macros without a trailing semicolon.
#define BEG(ptr, fld) startSection (ptr, TField::Field ## fld);
#define END(ptr, fld) finishSection(ptr, TField::Field ## fld);
#define SET(val, fld) storeSection(val, TField::Field ## fld);
#define CLR(ptr, fld) ResetSection (TField::Field ## fld, ptr);
#define REQ(ptr, req) setRequirement(ptr, TFeature :: req);

    // "nocs": the generated init code does not set cs; the start state is
    // picked explicitly from Flags right after.
    %% write init nocs;

    if (0 == (Flags & TFeature::FeatureNoRelPath)) {
        // FeatureNoRelPath is not set
        cs = TParser_en_URI_ref;
    } else if (0 == (Flags & TFeature::FeatureAllowRootless)) {
        // FeatureNoRelPath is set, FeatureAllowRootless is not
        cs = TParser_en_URI_ref_no_rootless;
    } else {
        // both FeatureNoRelPath and FeatureAllowRootless are set
        cs = TParser_en_URI_ref_no_relpath;
    }

    %% write exec;

#undef BEG
#undef END
#undef SET
#undef CLR
#undef REQ

    // Accept only if parsing stopped in one of the machine's final states.
    return cs >= TParser_first_final;
}

}