// fetch.cpp
#include "fetch.h"

#include <yql/essentials/utils/log/log.h>

#include <library/cpp/charset/ci_string.h>
#include <library/cpp/http/misc/httpcodes.h>
#include <library/cpp/openssl/io/stream.h>

#include <util/generic/ptr.h>
#include <util/generic/strbuf.h>
#include <util/generic/yexception.h>
#include <util/network/socket.h>
#include <util/string/cast.h>
  9. namespace NYql {
  10. namespace {
  11. THttpURL ParseURL(const TStringBuf addr, NUri::TParseFlags features) {
  12. THttpURL url;
  13. THttpURL::TParsedState parsedState = url.Parse(addr, features, nullptr, 65536);
  14. if (THttpURL::ParsedOK != parsedState) {
  15. ythrow yexception() << "Bad URL: \"" << addr << "\", " << HttpURLParsedStateToString(parsedState);
  16. }
  17. return url;
  18. }
  19. class TFetchResultImpl: public IFetchResult {
  20. public:
  21. TFetchResultImpl(const THttpURL& url, const THttpHeaders& additionalHeaders, TDuration timeout) {
  22. TString host = url.Get(THttpURL::FieldHost);
  23. TString path = url.PrintS(THttpURL::FlagPath | THttpURL::FlagQuery);
  24. const char* p = url.Get(THttpURL::FieldPort);
  25. ui16 port = 80;
  26. bool https = false;
  27. if (url.Get(THttpURL::FieldScheme) == TStringBuf("https")) {
  28. port = 443;
  29. https = true;
  30. }
  31. if (p) {
  32. port = FromString<ui16>(p);
  33. }
  34. TString req;
  35. {
  36. TStringOutput rqs(req);
  37. TStringBuf userAgent = "User-Agent: Mozilla/5.0 (compatible; YQL/1.0)";
  38. IOutputStream::TPart request[] = {
  39. IOutputStream::TPart("GET ", 4),
  40. IOutputStream::TPart(path.data(), path.size()),
  41. IOutputStream::TPart(" HTTP/1.1", 9),
  42. IOutputStream::TPart::CrLf(),
  43. IOutputStream::TPart("Host: ", 6),
  44. IOutputStream::TPart(host.data(), host.size()),
  45. IOutputStream::TPart::CrLf(),
  46. IOutputStream::TPart(userAgent.data(), userAgent.size()),
  47. IOutputStream::TPart::CrLf(),
  48. };
  49. rqs.Write(request, Y_ARRAY_SIZE(request));
  50. if (!additionalHeaders.Empty()) {
  51. additionalHeaders.OutTo(&rqs);
  52. }
  53. rqs << "\r\n";
  54. }
  55. Socket.Reset(new TSocket(TNetworkAddress(host, port), timeout));
  56. SocketInput.Reset(new TSocketInput(*Socket));
  57. SocketOutput.Reset(new TSocketOutput(*Socket));
  58. Socket->SetSocketTimeout(timeout.Seconds(), timeout.MilliSeconds() % 1000);
  59. if (https) {
  60. Ssl.Reset(new TOpenSslClientIO(SocketInput.Get(), SocketOutput.Get()));
  61. }
  62. {
  63. THttpOutput ho(Ssl ? (IOutputStream*)Ssl.Get() : (IOutputStream*)SocketOutput.Get());
  64. (ho << req).Finish();
  65. }
  66. HttpInput.Reset(new THttpInput(Ssl ? (IInputStream*)Ssl.Get() : (IInputStream*)SocketInput.Get()));
  67. }
  68. THttpInput& GetStream() override {
  69. return *HttpInput;
  70. }
  71. unsigned GetRetCode() override {
  72. return ParseHttpRetCode(HttpInput->FirstLine());
  73. }
  74. THttpURL GetRedirectURL(const THttpURL& baseUrl) override {
  75. for (auto i = HttpInput->Headers().Begin(); i != HttpInput->Headers().End(); ++i) {
  76. if (0 == TCiString::compare(i->Name(), TStringBuf("location"))) {
  77. THttpURL target = ParseURL(i->Value(), THttpURL::FeaturesAll | NUri::TFeature::FeatureConvertHostIDN);
  78. if (!target.IsValidAbs()) {
  79. target.Merge(baseUrl);
  80. }
  81. return target;
  82. }
  83. }
  84. ythrow yexception() << "Unknown redirect location from " << baseUrl.PrintS();
  85. }
  86. static TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders, const TDuration& timeout) {
  87. return new TFetchResultImpl(url, additionalHeaders, timeout);
  88. }
  89. private:
  90. THolder<TSocket> Socket;
  91. THolder<TSocketInput> SocketInput;
  92. THolder<TSocketOutput> SocketOutput;
  93. THolder<TOpenSslClientIO> Ssl;
  94. THolder<THttpInput> HttpInput;
  95. };
  96. inline bool IsRedirectCode(unsigned code) {
  97. switch (code) {
  98. case HTTP_MOVED_PERMANENTLY:
  99. case HTTP_FOUND:
  100. case HTTP_SEE_OTHER:
  101. case HTTP_TEMPORARY_REDIRECT:
  102. return true;
  103. }
  104. return false;
  105. }
  106. inline bool IsRetryCode(unsigned code) {
  107. switch (code) {
  108. case HTTP_REQUEST_TIME_OUT:
  109. case HTTP_AUTHENTICATION_TIMEOUT:
  110. case HTTP_TOO_MANY_REQUESTS:
  111. case HTTP_GATEWAY_TIME_OUT:
  112. case HTTP_SERVICE_UNAVAILABLE:
  113. return true;
  114. }
  115. return false;
  116. }
  117. } // unnamed
  118. THttpURL ParseURL(const TStringBuf addr) {
  119. return ParseURL(addr, THttpURL::FeaturesAll | NUri::TFeature::FeatureConvertHostIDN | NUri::TFeature::FeatureNoRelPath);
  120. }
  121. TFetchResultPtr Fetch(const THttpURL& url, const THttpHeaders& additionalHeaders, const TDuration& timeout, size_t retries, size_t redirects) {
  122. THttpURL currentUrl = url;
  123. for (size_t fetchNum = 0; fetchNum < redirects; ++fetchNum) {
  124. unsigned responseCode = 0;
  125. TFetchResultPtr fr;
  126. size_t fetchTry = 0;
  127. do {
  128. fr = TFetchResultImpl::Fetch(currentUrl, additionalHeaders, timeout);
  129. responseCode = fr->GetRetCode();
  130. } while (IsRetryCode(responseCode) && ++fetchTry < retries);
  131. if (responseCode >= 200 && responseCode < 300) {
  132. return fr;
  133. }
  134. if (responseCode == HTTP_NOT_MODIFIED) {
  135. return fr;
  136. }
  137. if (IsRedirectCode(responseCode)) {
  138. currentUrl = fr->GetRedirectURL(currentUrl);
  139. YQL_LOG(INFO) << "Got redirect to " << currentUrl.PrintS();
  140. continue;
  141. }
  142. TString errorBody;
  143. try {
  144. errorBody = fr->GetStream().ReadAll();
  145. } catch (...) {
  146. }
  147. ythrow yexception() << "Failed to fetch url '" << currentUrl.PrintS() << "' with code " << responseCode << ", body: " << errorBody;
  148. }
  149. ythrow yexception() << "Failed to fetch url '" << currentUrl.PrintS() << "': too many redirects";
  150. }
  151. } // NYql