// uri.cpp
  1. #include "uri.h"
  2. #include "parse.h"
  3. #include <util/string/cast.h>
  4. #include <util/string/util.h>
  5. #include <util/system/yassert.h>
  6. namespace NUri {
  7. TState::EParsed TUri::CheckHost(const TStringBuf& host) {
  8. if (host.empty())
  9. return ParsedOK;
  10. unsigned domainLevel = 0;
  11. unsigned domainLevelOfUnderscore = 0;
  12. bool isAlnum = false;
  13. bool startLabel = true;
  14. for (size_t i = 0; i != host.length(); ++i) {
  15. const char ch = host[i];
  16. if ('.' == ch) { // label separator
  17. if (!isAlnum || startLabel) // previous label must end in alnum
  18. return ParsedBadHost;
  19. startLabel = true;
  20. continue;
  21. }
  22. isAlnum = isalnum((const unsigned char)ch);
  23. if (startLabel) { // label is starting
  24. if (!isAlnum && '_' != ch) // new label must start with alnum or '_'
  25. return ParsedBadHost;
  26. startLabel = false;
  27. ++domainLevel;
  28. if (ch == '_')
  29. domainLevelOfUnderscore = domainLevel;
  30. continue;
  31. }
  32. if (isAlnum || '-' == ch)
  33. continue;
  34. if (ch == '_') { // non-standard case we allow for certain hosts
  35. domainLevelOfUnderscore = domainLevel;
  36. continue;
  37. }
  38. return ParsedBadHost;
  39. }
  40. if (0 < domainLevelOfUnderscore && domainLevel < 2 + domainLevelOfUnderscore)
  41. return ParsedBadHost;
  42. return ParsedOK;
  43. }
  44. /********************************************************/
  45. TUri::TUri(const TStringBuf& host, ui16 port, const TStringBuf& path, const TStringBuf& query, const TStringBuf& scheme, unsigned defaultPort, const TStringBuf& hashbang)
  46. : FieldsSet(0)
  47. , Port(port)
  48. , DefaultPort(0)
  49. , Scheme(SchemeEmpty)
  50. , FieldsDirty(0)
  51. {
  52. if (!scheme.empty()) {
  53. if (SetSchemeImpl(TSchemeInfo::Get(scheme)).Str.empty())
  54. FldSet(FieldScheme, scheme);
  55. }
  56. if (0 < defaultPort) // override the scheme's default port
  57. DefaultPort = static_cast<ui16>(defaultPort);
  58. char sport[6]; // enough for ui16
  59. if (0 != port) {
  60. const size_t len = ToString(port, sport, sizeof(sport));
  61. FldSet(FieldPort, TStringBuf(sport, len));
  62. }
  63. FldTrySet(FieldHost, host);
  64. FldTrySet(FieldPath, path);
  65. FldTrySet(FieldQuery, query);
  66. FldTrySet(FieldHashBang, hashbang);
  67. Rewrite();
  68. }
  69. /********************************************************/
  70. bool TUri::FldSetImpl(
  71. EField field, TStringBuf value, bool strconst, bool nocopy) {
  72. if (!FldIsValid(field))
  73. return false;
  74. switch (field) {
  75. case FieldScheme:
  76. if (!SetScheme(TSchemeInfo::Get(value)).Str.empty())
  77. return false;
  78. break;
  79. case FieldPort:
  80. Port = value.empty() ? 0 : FromString<ui16>(value);
  81. break;
  82. default:
  83. break;
  84. }
  85. if (!value.IsInited()) {
  86. FldClr(field);
  87. return false;
  88. }
  89. if (strconst) { // string constants don't need to be saved in the buffer
  90. FldMarkClean(field);
  91. FldSetNoDirty(field, value);
  92. return false;
  93. }
  94. if (nocopy) {
  95. FldSet(field, value);
  96. return true;
  97. }
  98. return FldTryCpy(field, value);
  99. }
  100. /********************************************************/
  101. bool TUri::FldTryCpy(EField field, const TStringBuf& value) {
  102. if (!FldIsDirty(field)) {
  103. do {
  104. if (!FldIsSet(field))
  105. break;
  106. TStringBuf& fld = Fields[field];
  107. if (fld.length() < value.length())
  108. break;
  109. char* oldV = (char*)fld.data();
  110. if (!IsInBuffer(oldV))
  111. break;
  112. memcpy(oldV, value.data(), value.length());
  113. oldV[value.length()] = 0;
  114. fld.Trunc(value.length());
  115. return false;
  116. } while (false);
  117. FldMarkDirty(field);
  118. }
  119. FldSetNoDirty(field, value);
  120. return true;
  121. }
  122. /********************************************************/
  123. void TUri::RewriteImpl() {
  124. size_t len = 0;
  125. for (int i = 0; i < FieldAllMAX; ++i) {
  126. const EField fld = EField(i);
  127. if (FldIsSet(fld))
  128. len += 1 + Fields[fld].length();
  129. }
  130. if (!len)
  131. Buffer.Clear();
  132. else {
  133. TBuffer newbuf;
  134. newbuf.Resize(len);
  135. TMemoryWriteBuffer out(newbuf.data(), newbuf.size());
  136. for (int i = 0; i < FieldAllMAX; ++i) {
  137. const EField fld = EField(i);
  138. if (!FldIsSet(fld))
  139. continue;
  140. const char* beg = out.Buf();
  141. const TStringBuf& val = Fields[fld];
  142. out << val;
  143. FldSetNoDirty(fld, TStringBuf(beg, val.length()));
  144. out << '\0';
  145. }
  146. Buffer = std::move(newbuf);
  147. }
  148. CheckMissingFields();
  149. FieldsDirty = 0;
  150. }
  151. void TUri::CheckMissingFields() {
  152. // if host is set but path is not...
  153. if (FldSetCmp(FlagPath | FlagHost, FlagHost))
  154. // ... and the scheme requires a path...
  155. if (GetSchemeInfo().FldReq & FlagPath)
  156. // ... set path
  157. FldSetNoDirty(FieldPath, TStringBuf("/"));
  158. }
  159. /********************************************************/
  // Resolves this (possibly relative) URI against base, borrowing the parts
  // this URI lacks.
  //
  // Nothing happens when base has an unknown scheme, is not a valid global
  // URI, or when both URIs carry a scheme and the two differ (ignoring case).
  //
  //  base       - URI to take missing components from;
  //  correctAbs - forwarded to PathOperation() when the two paths have to be
  //               combined.
  //
  // If combining the paths fails, the whole URI is cleared.
  void TUri::Merge(const TUri& base, int correctAbs) {
      if (base.Scheme == SchemeUnknown)
          return;
      if (!base.IsValidGlobal())
          return;
      const TStringBuf& selfscheme = GetField(FieldScheme);
      // basescheme is present since IsValidGlobal() succeeded
      const TStringBuf& basescheme = base.GetField(FieldScheme);
      const bool noscheme = !selfscheme.IsInited();
      if (!noscheme && !EqualNoCase(selfscheme, basescheme))
          return;
      // fields that were clean on entry; used at the end to detect fields
      // that became dirty during the merge
      const ui32 cleanFields = ~FieldsDirty;
      do {
          // also serves as a sentinel: compared by data pointer below to tell
          // "base had no path" apart from a real base path
          static constexpr TStringBuf rootPath = "/";
          if (noscheme) {
              // inherit the scheme (text, id and default port) from base
              if (!basescheme.empty()) {
                  FldSetNoDirty(FieldScheme, basescheme);
                  // check if it is canonical: a scheme string that is not the
                  // scheme info's own string is marked dirty
                  if (basescheme.data() != base.GetSchemeInfo().Str.data())
                      FldMarkDirty(FieldScheme);
              }
              Scheme = base.Scheme;
              DefaultPort = base.DefaultPort;
          }
          if (!IsNull(FlagHost))
              break; // no merge: this URI has its own host
          FldTrySet(FieldHost, base);
          FldChkSet(FieldPort, base);
          Port = base.Port;
          // query and hashbang are borrowed only when this URI had neither a
          // scheme nor a path, and lacks the respective field itself
          if (noscheme && IsNull(FlagQuery) && IsNull(FlagPath))
              FldTrySet(FieldQuery, base);
          if (noscheme && IsNull(FlagHashBang) && IsNull(FlagPath))
              FldTrySet(FieldHashBang, base);
          // credentials are borrowed only if this URI has none at all
          if (IsNull(FlagAuth) && !base.IsNull(FlagAuth)) {
              FldChkSet(FieldUser, base);
              FldChkSet(FieldPass, base);
          }
          if (IsValidAbs())
              break; // already absolute, the path needs no work
          // p0: base path (or "/" when base has none), p1: own path
          TStringBuf p0 = base.GetField(FieldPath);
          if (!p0.IsInited())
              p0 = rootPath;
          TStringBuf p1 = GetField(FieldPath);
          if (!p1.IsInited()) {
              // no own path: take the base path; the constant "/" is set
              // without marking the field dirty
              if (p0.data() != rootPath.data())
                  FldSet(FieldPath, p0);
              else
                  FldSetNoDirty(FieldPath, rootPath);
              break;
          }
          if (p1 && '/' == p1[0])
              p1.Skip(1); // p0 will have one
          bool pathop = true;
          // + 4 leaves room for the "/../" that may be inserted between the paths
          TTempBufOutput out(p0.length() + p1.length() + 4);
          out << p0;
          // NOTE(review): p0.back() is called without checking p0.empty();
          // confirm a base that passed IsValidGlobal() cannot have an
          // initialized but empty path.
          if ('/' != p0.back())
              // base path does not end with '/': "/../" makes PathOperation()
              // drop its last component before p1 is applied
              out << "/../";
          else if (p1.empty() || '.' != p1[0])
              // plain concatenation; PathOperation() is skipped when p1 does
              // not start with '.'
              pathop = false;
          out << p1;
          char* beg = out.Data();
          char* end = beg + out.Filled();
          if (pathop && !PathOperation(beg, end, correctAbs)) {
              Clear();
              break;
          }
          // Needs immediate forced rewrite because of TTempBuf: the path
          // field would otherwise keep pointing into the temporary output
          FldSetNoDirty(FieldPath, TStringBuf(beg, end));
          RewriteImpl();
      } while (false);
      CheckMissingFields();
      // rewrite only if borrowed fields from base: i.e. some field that was
      // clean on entry is dirty now
      if (cleanFields & FieldsDirty)
          RewriteImpl();
  }
  235. /********************************************************/
  236. TUri::TLinkType TUri::Normalize(const TUri& base,
  237. const TStringBuf& link, const TStringBuf& codebase, ui64 careFlags, ECharset enc) {
  238. // parse URL
  239. if (ParsedOK != ParseImpl(link, careFlags, 0, SchemeEmpty, enc))
  240. return LinkIsBad;
  241. const TStringBuf& host = GetHost();
  242. // merge with base URL
  243. // taken either from _BASE_ property or from optional argument
  244. if (!codebase.empty()) {
  245. // if optional code base given -- parse it
  246. TUri codebaseUrl;
  247. if (codebaseUrl.ParseImpl(codebase, careFlags, 0, SchemeEmpty, enc) != ParsedOK || !codebaseUrl.IsValidAbs())
  248. return LinkIsBad;
  249. Merge(codebaseUrl);
  250. } else {
  251. // Base is already in this variable
  252. // see SetProperty() for details
  253. Merge(base);
  254. }
  255. // check result: must be correct absolute URL
  256. if (!IsValidAbs())
  257. return LinkBadAbs;
  258. if (!host.empty()) {
  259. // - we don't care about different ports for the same server
  260. // - we don't care about win|www|koi|etc. preffixes for the same server
  261. if (GetPort() != base.GetPort() || !EqualNoCase(host, base.GetHost()))
  262. return LinkIsGlobal;
  263. }
  264. // find out if it is link to itself then ignore it
  265. if (!Compare(base, FlagPath | FlagQuery | FlagHashBang))
  266. return LinkIsFragment;
  267. return LinkIsLocal;
  268. }
  269. /********************************************************/
  270. size_t TUri::PrintSize(ui32 flags) const {
  271. size_t len = 10;
  272. flags &= FieldsSet; // can't output what we don't have
  273. if (flags & FlagHostAscii)
  274. flags &= ~FlagHost; // don't want to print both of them
  275. ui32 opt = 1;
  276. for (int fld = 0; opt <= flags && fld < FieldAllMAX; ++fld, opt <<= 1) {
  277. if (opt & flags) {
  278. const TStringBuf& v = Fields[fld];
  279. if (v.IsInited()) {
  280. if (opt & FlagAuth)
  281. len += 3 * v.length() + 1;
  282. else
  283. len += v.length() + 1;
  284. }
  285. }
  286. }
  287. return len;
  288. }
  289. IOutputStream& TUri::PrintImpl(IOutputStream& out, int flags) const {
  290. TStringBuf v;
  291. const int wantFlags = flags; // save the original
  292. flags &= FieldsSet; // can't print what we don't have
  293. if (flags & FlagHostAscii)
  294. flags |= FlagHost; // to make host checks simpler below
  295. if (flags & FlagScheme) {
  296. v = Fields[FieldScheme];
  297. if (!v.empty())
  298. out << v << ':';
  299. }
  300. TStringBuf host;
  301. if (flags & FlagHost) {
  302. const EField fldhost =
  303. flags & FlagHostAscii ? FieldHostAscii : FieldHost;
  304. host = Fields[fldhost];
  305. }
  306. TStringBuf port;
  307. if ((flags & FlagPort) && 0 != Port && Port != DefaultPort)
  308. port = Fields[FieldPort];
  309. if (host) {
  310. if (wantFlags & FlagScheme)
  311. out << "//";
  312. if (flags & FlagAuth) {
  313. if (flags & FlagUser) {
  314. v = Fields[FieldUser];
  315. if (!v.empty())
  316. TEncoder::EncodeNotAlnum(out, v);
  317. }
  318. if (flags & FlagPass) {
  319. v = Fields[FieldPass];
  320. if (v.IsInited()) {
  321. out << ':';
  322. TEncoder::EncodeAll(out, v);
  323. }
  324. }
  325. out << '@';
  326. }
  327. out << host;
  328. if (port)
  329. out << ':';
  330. }
  331. if (port)
  332. out << port;
  333. if (flags & FlagPath) {
  334. v = Fields[FieldPath];
  335. // for relative, empty path is not the same as missing
  336. if (v.empty() && 0 == (flags & FlagHost))
  337. v = TStringBuf(".");
  338. out << v;
  339. }
  340. if (flags & FlagQuery) {
  341. v = Fields[FieldQuery];
  342. if (v.IsInited())
  343. out << '?' << v;
  344. }
  345. if (flags & FlagFrag) {
  346. v = Fields[FieldFrag];
  347. if (v.IsInited())
  348. out << '#' << v;
  349. }
  350. if (flags & FlagHashBang) {
  351. v = Fields[FieldHashBang];
  352. if (v.IsInited())
  353. out << '#' << '!' << v;
  354. }
  355. return out;
  356. }
  357. /********************************************************/
  358. int TUri::CompareField(EField fld, const TUri& url) const {
  359. const TStringBuf& v0 = GetField(fld);
  360. const TStringBuf& v1 = url.GetField(fld);
  361. switch (fld) {
  362. case FieldScheme:
  363. case FieldHost:
  364. return CompareNoCase(v0, v1);
  365. default:
  366. return v0.compare(v1);
  367. }
  368. }
  369. /********************************************************/
  370. int TUri::Compare(const TUri& url, int flags) const {
  371. // first compare fields with default values
  372. if (flags & FlagPort) {
  373. const int ret = GetPort() - url.GetPort();
  374. if (ret)
  375. return ret;
  376. flags &= ~FlagPort;
  377. }
  378. // compare remaining sets of available fields
  379. const int rtflags = flags & url.FieldsSet;
  380. flags &= FieldsSet;
  381. const int fldcmp = flags - rtflags;
  382. if (fldcmp)
  383. return fldcmp;
  384. // field sets are the same, compare the fields themselves
  385. for (int i = 0; i < FieldAllMAX; ++i) {
  386. const EField fld = EField(i);
  387. if (flags & FldFlag(fld)) {
  388. const int ret = CompareField(fld, url);
  389. if (ret)
  390. return ret;
  391. }
  392. }
  393. return 0;
  394. }
  395. /********************************************************/
  // Normalizes the path stored in [pathPtr, pathEnd) in place: removes "."
  // segments, resolves ".." segments against the segments preceding them and
  // drops empty segments produced by repeated '/'.
  //
  // The path is scanned from its end towards its start and the result is
  // assembled backwards, ending at pathEnd.
  //
  //  pathPtr    - in: start of the path; out (on success): start of the
  //               normalized path, which lies inside the same memory;
  //  pathEnd    - in: one past the last character; may be decremented by one
  //               when the path ends in "/.";
  //  correctAbs - policy for ".." segments that would climb above the root of
  //               an absolute path: > 0 fail, == 0 keep them as "/..",
  //               < 0 silently drop them. A non-zero value also lets a
  //               leading "/." of an absolute path be dropped.
  //
  // Returns false when pathPtr is null or when the policy above rejects the
  // path; true otherwise (an empty path is returned untouched).
  bool TUri::PathOperation(char*& pathPtr, char*& pathEnd, int correctAbs) {
      if (!pathPtr)
          return false;
      if (pathPtr == pathEnd)
          return true;
      // a trailing "/." becomes "/" : just cut the dot off
      if ((pathEnd - pathPtr) >= 2 && *(pathEnd - 2) == '/' && *(pathEnd - 1) == '.') {
          --pathEnd;
      }
      char* p_wr = pathEnd; // write cursor; the result grows downwards from pathEnd
      int upCount = 0;      // number of ".." segments not yet matched by a real segment
      char* p_prev = pathEnd;
      Y_ASSERT(p_prev > pathPtr);
      // step back over the run of trailing '/'
      while (p_prev > pathPtr && *(p_prev - 1) == '/')
          p_prev--;
      // Each iteration handles one segment [p, p_rd):
      //  p_rd   - end of the segment (pathEnd or a '/');
      //  p      - first character of the segment;
      //  p_prev - the '/' in front of the segment (first of a run of '/'),
      //           or nullptr when the start of the path has been reached.
      for (char* p_rd = p_prev; p_rd; p_rd = p_prev) {
          Y_ASSERT(p_rd == pathEnd || p_rd[0] == '/');
          p_prev = nullptr;
          char* p = p_rd;
          if (p > pathPtr) {
              // search backwards for the '/' that precedes the segment
              for (p--; *p != '/'; p--) {
                  if (p == pathPtr)
                      break;
              }
              if (*p == '/') {
                  p_prev = p++;
                  // The slash found is the second one of "http://" or
                  // "https://" (any case): move one slash back so that this
                  // slash becomes part of the segment instead of being
                  // collapsed. NOTE(review): presumably this keeps URLs
                  // embedded in a path intact - confirm.
                  if ((p_prev - pathPtr >= 6 && !strnicmp(p_prev - 6, "http://", 7)) ||
                      (p_prev - pathPtr >= 7 && !strnicmp(p_prev - 7, "https://", 8))) {
                      --p_prev;
                      --p;
                  } else {
                      // skip a run of '/' in front of the segment
                      while (p_prev > pathPtr && *(p_prev - 1) == '/')
                          p_prev--;
                  }
              }
          }
          Y_ASSERT(p_prev == nullptr || p_prev[0] == '/');
          // p is the first character of the segment that ends at p_rd
          if (p == p_rd) {
              // empty segment:
              if (p_prev) { // either the tail (trailing slashes): drop one character
                  Y_ASSERT(p_rd == p_wr && *(p - 1) == '/');
                  --p_wr;
                  continue;
              } else { // or the head of an absolute path: emit the leading '/'
                  *(--p_wr) = '/';
                  break;
              }
          }
          if (p[0] == '.') {
              if (p + 1 == p_rd) {
                  // the segment is "."; it is kept only when it directly
                  // follows the leading '/' of an absolute path and
                  // correctAbs is zero
                  if (correctAbs || p_prev > pathPtr || pathPtr[0] != '/')
                      // ignore "./"
                      continue;
              } else {
                  if ((p[1] == '.') && (p + 2 == p_rd)) {
                      // register "../" but do not emit it
                      upCount++;
                      continue;
                  }
              }
          }
          if (upCount) {
              // this segment is cancelled by a pending "../": do not emit it
              upCount--;
              continue;
          }
          // emit the segment
          Y_ASSERT(p < p_rd);
          Y_ASSERT(!p_prev || *(p - 1) == '/');
          if (p_wr == p_rd) { // already in its final place: just extend the result
              p_wr = p;
          } else { // move the segment together with the character at p_rd
              int l = p_rd - p + 1;
              p_wr -= l;
              memmove(p_wr, p, l);
          }
      }
      if (upCount) {
          // some ".." segments could not be matched by real segments
          if (*pathPtr != '/') {
              // relative path: keep them as leading "../"
              if (pathEnd == p_wr && *(p_wr - 1) == '.') {
                  // nothing was emitted and the input ends in ".": reuse that
                  // trailing ".." (without a slash) for one of the pending ups
                  Y_ASSERT(*(p_wr - 2) == '.');
                  p_wr -= 2;
                  upCount--;
              }
              for (; upCount > 0; upCount--) {
                  // written backwards, reads "../" in memory
                  *(--p_wr) = '/';
                  *(--p_wr) = '.';
                  *(--p_wr) = '.';
              }
          } else {
              // absolute path: ".." would climb above the root
              if (correctAbs > 0)
                  return false;
              if (correctAbs == 0) {
                  // Bad path but present in RFC:
                  // "Similarly, parsers must avoid treating "." and ".."
                  // as special when they are not complete components of
                  // a relative path. "
                  for (; upCount > 0; upCount--) {
                      // written backwards, reads "/.." in memory
                      *(--p_wr) = '.';
                      *(--p_wr) = '.';
                      *(--p_wr) = '/';
                  }
              } else {
                  // negative correctAbs: forget the extra ".." segments
                  upCount = false;
              }
          }
      }
      Y_ASSERT(p_wr >= pathPtr);
      // defensive: every branch above leaves upCount at zero or has returned
      if (upCount)
          return false;
      pathPtr = p_wr;
      return true;
  }
  510. /********************************************************/
  511. const char* LinkTypeToString(const TUri::TLinkType& t) {
  512. switch (t) {
  513. case TUri::LinkIsBad:
  514. return "LinkIsBad";
  515. case TUri::LinkBadAbs:
  516. return "LinkBadAbs";
  517. case TUri::LinkIsFragment:
  518. return "LinkIsFragment";
  519. case TUri::LinkIsLocal:
  520. return "LinkIsLocal";
  521. case TUri::LinkIsGlobal:
  522. return "LinkIsGlobal";
  523. }
  524. Y_ASSERT(0);
  525. return "";
  526. }
  527. }