http_parser.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. #include "http_parser.h"
  2. #include <library/cpp/blockcodecs/stream.h>
  3. #include <library/cpp/blockcodecs/codecs.h>
  4. #include <library/cpp/streams/brotli/brotli.h>
  5. #include <util/generic/string.h>
  6. #include <util/generic/yexception.h>
  7. #include <util/stream/mem.h>
  8. #include <util/stream/zlib.h>
  9. #include <util/string/ascii.h>
  10. #include <util/string/split.h>
  11. #include <util/string/strip.h>
  // Debug trace hook used by the parser states below.
  // Swap the two definitions to stream trace messages to Cout; the active
  // definition expands to nothing, so every DBGOUT(...) call compiles away.
  //#define DBGOUT(args) Cout << args << Endl;
  #define DBGOUT(args)
  14. namespace {
  15. const TString BestCodings[] = {
  16. "gzip",
  17. "deflate",
  18. "br",
  19. "x-gzip",
  20. "x-deflate",
  21. "y-lzo",
  22. "y-lzf",
  23. "y-lzq",
  24. "y-bzip2",
  25. "y-lzma",
  26. };
  27. }
  28. TString THttpParser::GetBestCompressionScheme() const {
  29. if (AcceptEncodings_.contains("*")) {
  30. return BestCodings[0];
  31. }
  32. for (auto& coding : BestCodings) {
  33. if (AcceptEncodings_.contains(coding)) {
  34. return coding;
  35. }
  36. }
  37. return TString();
  38. }
  39. const THashSet<TString>& THttpParser::AcceptedEncodings() const {
  40. return AcceptEncodings_;
  41. }
  42. bool THttpParser::FirstLineParser() {
  43. if (Y_UNLIKELY(!ReadLine())) {
  44. return false;
  45. }
  46. CurrentLine_.swap(FirstLine_);
  47. try {
  48. TStringBuf s(FirstLine_);
  49. if (MessageType_ == Response) {
  50. // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
  51. TStringBuf httpVersion, statusCode;
  52. GetNext(s, ' ', httpVersion);
  53. ParseHttpVersion(httpVersion);
  54. GetNext(s, ' ', statusCode);
  55. RetCode_ = FromString<unsigned>(statusCode);
  56. } else {
  57. // Request-Line = Method SP Request-URI SP HTTP-Version CRLF
  58. TStringBuf httpVersion = s.After(' ').After(' ');
  59. ParseHttpVersion(httpVersion);
  60. }
  61. } catch (...) {
  62. throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage() << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
  63. }
  64. return HeadersParser();
  65. }
  66. bool THttpParser::HeadersParser() {
  67. while (ReadLine()) {
  68. if (!CurrentLine_) {
  69. //end of headers
  70. DBGOUT("end of headers()");
  71. ParseHeaderLine();
  72. if (HasContentLength_) {
  73. if (ContentLength_ == 0) {
  74. return OnEndParsing();
  75. }
  76. if (ContentLength_ < 1000000) {
  77. Content_.reserve(ContentLength_ + 1);
  78. }
  79. }
  80. return !!ChunkInputState_ ? ChunkedContentParser() : ContentParser();
  81. }
  82. if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
  83. //continue previous header-line
  84. HeaderLine_ += CurrentLine_;
  85. CurrentLine_.remove(0);
  86. } else {
  87. ParseHeaderLine();
  88. HeaderLine_.swap(CurrentLine_);
  89. }
  90. }
  91. Parser_ = &THttpParser::HeadersParser;
  92. return false;
  93. }
  94. bool THttpParser::ContentParser() {
  95. DBGOUT("Content parsing()");
  96. if (HasContentLength_ && !BodyNotExpected_) {
  97. size_t rd = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
  98. Content_.append(Data_, rd);
  99. Data_ += rd;
  100. DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
  101. if (Content_.size() == ContentLength_) {
  102. return OnEndParsing();
  103. }
  104. } else {
  105. if (MessageType_ == Request) {
  106. return OnEndParsing(); //RFC2616 4.4-5
  107. } else if (Y_UNLIKELY(BodyNotExpected_ || RetCode() < 200 || RetCode() == 204 || RetCode() == 304)) {
  108. return OnEndParsing(); //RFC2616 4.4-1
  109. }
  110. Content_.append(Data_, DataEnd_);
  111. Data_ = DataEnd_;
  112. }
  113. Parser_ = &THttpParser::ContentParser;
  114. return false;
  115. }
  116. bool THttpParser::ChunkedContentParser() {
  117. DBGOUT("ReadChunkedContent");
  118. TChunkInputState& ci = *ChunkInputState_;
  119. if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
  120. //try reduce memory reallocations
  121. Content_.reserve(DataEnd_ - Data_);
  122. }
  123. do {
  124. if (!ci.LeftBytes_) {
  125. if (Y_UNLIKELY(!ReadLine())) { //read first chunk size or CRLF from prev chunk or CRLF from last chunk
  126. break;
  127. }
  128. if (Y_UNLIKELY(ci.ReadLastChunk_)) {
  129. return OnEndParsing();
  130. }
  131. if (!CurrentLine_) {
  132. // skip crlf from previous chunk
  133. if (!ReadLine()) {
  134. break;
  135. }
  136. }
  137. Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");
  138. size_t size = CurrentLine_.find_first_of(" \t;");
  139. if (size == TString::npos) {
  140. size = CurrentLine_.size();
  141. }
  142. ci.LeftBytes_ = IntFromString<ui32, 16, char>(CurrentLine_.c_str(), size);
  143. CurrentLine_.remove(0);
  144. if (!ci.LeftBytes_) { //detectect end of context marker - zero-size chunk, need read CRLF after empty chunk
  145. ci.ReadLastChunk_ = true;
  146. if (ReadLine()) {
  147. return OnEndParsing();
  148. } else {
  149. break;
  150. }
  151. }
  152. }
  153. size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
  154. Content_.append(Data_, rd);
  155. Data_ += rd;
  156. ci.LeftBytes_ -= rd;
  157. } while (Data_ != DataEnd_);
  158. Parser_ = &THttpParser::ChunkedContentParser;
  159. return false;
  160. }
  161. bool THttpParser::OnEndParsing() {
  162. Parser_ = &THttpParser::OnEndParsing;
  163. ExtraDataSize_ = DataEnd_ - Data_;
  164. return true;
  165. }
  166. //continue read to CurrentLine_
  167. bool THttpParser::ReadLine() {
  168. TStringBuf in(Data_, DataEnd_);
  169. size_t endl = in.find('\n');
  170. if (Y_UNLIKELY(endl == TStringBuf::npos)) {
  171. //input line not completed
  172. CurrentLine_.append(Data_, DataEnd_);
  173. return false;
  174. }
  175. CurrentLine_.append(in.data(), endl);
  176. if (Y_LIKELY(CurrentLine_.size())) {
  177. //remove '\r' from tail
  178. size_t withoutCR = CurrentLine_.size() - 1;
  179. if (CurrentLine_[withoutCR] == '\r') {
  180. CurrentLine_.remove(withoutCR);
  181. }
  182. }
  183. //Cout << "ReadLine:" << CurrentLine_ << Endl;
  184. Data_ += endl + 1;
  185. return true;
  186. }
  187. void THttpParser::ParseHttpVersion(TStringBuf httpVersion) {
  188. if (!httpVersion.StartsWith("HTTP/", 5)) {
  189. throw yexception() << "expect 'HTTP/'";
  190. }
  191. httpVersion.Skip(5);
  192. {
  193. TStringBuf major, minor;
  194. Split(httpVersion, '.', major, minor);
  195. HttpVersion_.Major = FromString<unsigned>(major);
  196. HttpVersion_.Minor = FromString<unsigned>(minor);
  197. if (Y_LIKELY(HttpVersion_.Major > 1 || HttpVersion_.Minor > 0)) {
  198. // since HTTP/1.1 Keep-Alive is default behaviour
  199. KeepAlive_ = true;
  200. }
  201. }
  202. }
  203. void THttpParser::ParseHeaderLine() {
  204. if (!!HeaderLine_) {
  205. if (CollectHeaders_) {
  206. THttpInputHeader hdr(HeaderLine_);
  207. Headers_.AddHeader(hdr);
  208. ApplyHeaderLine(hdr.Name(), hdr.Value());
  209. } else {
  210. //some dirty optimization (avoid reallocation new strings)
  211. size_t pos = HeaderLine_.find(':');
  212. if (pos == TString::npos) {
  213. ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
  214. }
  215. TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), HeaderLine_.begin() + pos)));
  216. TStringBuf val(StripString(TStringBuf(HeaderLine_.begin() + pos + 1, HeaderLine_.end())));
  217. ApplyHeaderLine(name, val);
  218. }
  219. HeaderLine_.remove(0);
  220. }
  221. }
  222. void THttpParser::OnEof() {
  223. if (Parser_ == &THttpParser::ContentParser && !HasContentLength_ && !ChunkInputState_) {
  224. return; //end of content determined by end of input
  225. }
  226. throw THttpException() << TStringBuf("incompleted http response");
  227. }
  228. bool THttpParser::DecodeContent(TString& decodedContent) const {
  229. if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
  230. decodedContent = Content_;
  231. return false;
  232. }
  233. TMemoryInput in(Content_.data(), Content_.size());
  234. if (ContentEncoding_ == "gzip") {
  235. auto decompressor = TZLibDecompress(&in, ZLib::GZip);
  236. if (!GzipAllowMultipleStreams_) {
  237. decompressor.SetAllowMultipleStreams(false);
  238. }
  239. decodedContent = decompressor.ReadAll();
  240. } else if (ContentEncoding_ == "deflate") {
  241. //https://tools.ietf.org/html/rfc1950
  242. bool definitelyNoZlibHeader;
  243. if (Content_.size() < 2) {
  244. definitelyNoZlibHeader = true;
  245. } else {
  246. const ui16 cmf = static_cast<ui8>(Content_[0]);
  247. const ui16 flg = static_cast<ui8>(Content_[1]);
  248. definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
  249. }
  250. try {
  251. decodedContent = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
  252. }
  253. catch(...) {
  254. if (definitelyNoZlibHeader) {
  255. throw;
  256. }
  257. TMemoryInput retryInput(Content_.data(), Content_.size());
  258. decodedContent = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
  259. }
  260. } else if (ContentEncoding_.StartsWith("z-")) {
  261. // opposite for library/cpp/http/io/stream.h
  262. const NBlockCodecs::ICodec* codec = nullptr;
  263. try {
  264. const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2);
  265. if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
  266. ythrow NBlockCodecs::TNotFound() << codecName;
  267. }
  268. codec = NBlockCodecs::Codec(codecName);
  269. } catch(const NBlockCodecs::TNotFound& exc) {
  270. throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
  271. }
  272. NBlockCodecs::TDecodedInput decoder(&in, codec);
  273. decodedContent = decoder.ReadAll();
  274. } else if (ContentEncoding_ == "lz4") {
  275. const auto* codec = NBlockCodecs::Codec(TStringBuf(ContentEncoding_));
  276. decodedContent = codec->Decode(Content_);
  277. } else if (ContentEncoding_ == "br") {
  278. TBrotliDecompress decoder(&in);
  279. decodedContent = decoder.ReadAll();
  280. } else {
  281. throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
  282. }
  283. return true;
  284. }
  285. void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
  286. if (AsciiEqualsIgnoreCase(name, TStringBuf("connection"))) {
  287. KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
  288. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-length"))) {
  289. Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
  290. ContentLength_ = FromString<ui64>(val);
  291. HasContentLength_ = true;
  292. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("transfer-encoding"))) {
  293. if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
  294. ChunkInputState_ = new TChunkInputState();
  295. }
  296. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("accept-encoding"))) {
  297. TStringBuf encodings(val);
  298. while (encodings.size()) {
  299. TStringBuf enc = encodings.NextTok(',').After(' ').Before(' ');
  300. if (!enc) {
  301. continue;
  302. }
  303. TString s(enc);
  304. s.to_lower();
  305. AcceptEncodings_.insert(s);
  306. }
  307. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-encoding"))) {
  308. TString s(val);
  309. s.to_lower();
  310. ContentEncoding_ = s;
  311. }
  312. }