http_parser.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. #include "http_parser.h"
  2. #include <library/cpp/blockcodecs/stream.h>
  3. #include <library/cpp/blockcodecs/codecs.h>
  4. #include <library/cpp/streams/brotli/brotli.h>
  5. #include <util/generic/string.h>
  6. #include <util/generic/yexception.h>
  7. #include <util/stream/mem.h>
  8. #include <util/stream/zlib.h>
  9. #include <util/string/ascii.h>
  10. #include <util/string/split.h>
  11. #include <util/string/strip.h>
  12. //#define DBGOUT(args) Cout << args << Endl;
  13. #define DBGOUT(args)
  14. namespace {
  15. const TString BestCodings[] = {
  16. "gzip",
  17. "deflate",
  18. "br",
  19. "x-gzip",
  20. "x-deflate",
  21. "y-lzo",
  22. "y-lzf",
  23. "y-lzq",
  24. "y-bzip2",
  25. "y-lzma",
  26. };
  27. }
  28. TString THttpParser::GetBestCompressionScheme() const {
  29. if (AcceptEncodings_.contains("*")) {
  30. return BestCodings[0];
  31. }
  32. for (auto& coding : BestCodings) {
  33. if (AcceptEncodings_.contains(coding)) {
  34. return coding;
  35. }
  36. }
  37. return TString();
  38. }
  39. bool THttpParser::FirstLineParser() {
  40. if (Y_UNLIKELY(!ReadLine())) {
  41. return false;
  42. }
  43. CurrentLine_.swap(FirstLine_);
  44. try {
  45. TStringBuf s(FirstLine_);
  46. if (MessageType_ == Response) {
  47. // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
  48. TStringBuf httpVersion, statusCode;
  49. GetNext(s, ' ', httpVersion);
  50. ParseHttpVersion(httpVersion);
  51. GetNext(s, ' ', statusCode);
  52. RetCode_ = FromString<unsigned>(statusCode);
  53. } else {
  54. // Request-Line = Method SP Request-URI SP HTTP-Version CRLF
  55. TStringBuf httpVersion = s.After(' ').After(' ');
  56. ParseHttpVersion(httpVersion);
  57. }
  58. } catch (...) {
  59. throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage() << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
  60. }
  61. return HeadersParser();
  62. }
  63. bool THttpParser::HeadersParser() {
  64. while (ReadLine()) {
  65. if (!CurrentLine_) {
  66. //end of headers
  67. DBGOUT("end of headers()");
  68. ParseHeaderLine();
  69. if (HasContentLength_) {
  70. if (ContentLength_ == 0) {
  71. return OnEndParsing();
  72. }
  73. if (ContentLength_ < 1000000) {
  74. Content_.reserve(ContentLength_ + 1);
  75. }
  76. }
  77. return !!ChunkInputState_ ? ChunkedContentParser() : ContentParser();
  78. }
  79. if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
  80. //continue previous header-line
  81. HeaderLine_ += CurrentLine_;
  82. CurrentLine_.remove(0);
  83. } else {
  84. ParseHeaderLine();
  85. HeaderLine_.swap(CurrentLine_);
  86. }
  87. }
  88. Parser_ = &THttpParser::HeadersParser;
  89. return false;
  90. }
  91. bool THttpParser::ContentParser() {
  92. DBGOUT("Content parsing()");
  93. if (HasContentLength_ && !BodyNotExpected_) {
  94. size_t rd = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
  95. Content_.append(Data_, rd);
  96. Data_ += rd;
  97. DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
  98. if (Content_.size() == ContentLength_) {
  99. return OnEndParsing();
  100. }
  101. } else {
  102. if (MessageType_ == Request) {
  103. return OnEndParsing(); //RFC2616 4.4-5
  104. } else if (Y_UNLIKELY(BodyNotExpected_ || RetCode() < 200 || RetCode() == 204 || RetCode() == 304)) {
  105. return OnEndParsing(); //RFC2616 4.4-1
  106. }
  107. Content_.append(Data_, DataEnd_);
  108. Data_ = DataEnd_;
  109. }
  110. Parser_ = &THttpParser::ContentParser;
  111. return false;
  112. }
  113. bool THttpParser::ChunkedContentParser() {
  114. DBGOUT("ReadChunkedContent");
  115. TChunkInputState& ci = *ChunkInputState_;
  116. if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
  117. //try reduce memory reallocations
  118. Content_.reserve(DataEnd_ - Data_);
  119. }
  120. do {
  121. if (!ci.LeftBytes_) {
  122. if (Y_UNLIKELY(!ReadLine())) { //read first chunk size or CRLF from prev chunk or CRLF from last chunk
  123. break;
  124. }
  125. if (Y_UNLIKELY(ci.ReadLastChunk_)) {
  126. return OnEndParsing();
  127. }
  128. if (!CurrentLine_) {
  129. // skip crlf from previous chunk
  130. if (!ReadLine()) {
  131. break;
  132. }
  133. }
  134. Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");
  135. size_t size = CurrentLine_.find_first_of(" \t;");
  136. if (size == TString::npos) {
  137. size = CurrentLine_.size();
  138. }
  139. ci.LeftBytes_ = IntFromString<ui32, 16, char>(CurrentLine_.c_str(), size);
  140. CurrentLine_.remove(0);
  141. if (!ci.LeftBytes_) { //detectect end of context marker - zero-size chunk, need read CRLF after empty chunk
  142. ci.ReadLastChunk_ = true;
  143. if (ReadLine()) {
  144. return OnEndParsing();
  145. } else {
  146. break;
  147. }
  148. }
  149. }
  150. size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
  151. Content_.append(Data_, rd);
  152. Data_ += rd;
  153. ci.LeftBytes_ -= rd;
  154. } while (Data_ != DataEnd_);
  155. Parser_ = &THttpParser::ChunkedContentParser;
  156. return false;
  157. }
  158. bool THttpParser::OnEndParsing() {
  159. Parser_ = &THttpParser::OnEndParsing;
  160. ExtraDataSize_ = DataEnd_ - Data_;
  161. return true;
  162. }
  163. //continue read to CurrentLine_
  164. bool THttpParser::ReadLine() {
  165. TStringBuf in(Data_, DataEnd_);
  166. size_t endl = in.find('\n');
  167. if (Y_UNLIKELY(endl == TStringBuf::npos)) {
  168. //input line not completed
  169. CurrentLine_.append(Data_, DataEnd_);
  170. return false;
  171. }
  172. CurrentLine_.append(in.data(), endl);
  173. if (Y_LIKELY(CurrentLine_.size())) {
  174. //remove '\r' from tail
  175. size_t withoutCR = CurrentLine_.size() - 1;
  176. if (CurrentLine_[withoutCR] == '\r') {
  177. CurrentLine_.remove(withoutCR);
  178. }
  179. }
  180. //Cout << "ReadLine:" << CurrentLine_ << Endl;
  181. Data_ += endl + 1;
  182. return true;
  183. }
  184. void THttpParser::ParseHttpVersion(TStringBuf httpVersion) {
  185. if (!httpVersion.StartsWith("HTTP/", 5)) {
  186. throw yexception() << "expect 'HTTP/'";
  187. }
  188. httpVersion.Skip(5);
  189. {
  190. TStringBuf major, minor;
  191. Split(httpVersion, '.', major, minor);
  192. HttpVersion_.Major = FromString<unsigned>(major);
  193. HttpVersion_.Minor = FromString<unsigned>(minor);
  194. if (Y_LIKELY(HttpVersion_.Major > 1 || HttpVersion_.Minor > 0)) {
  195. // since HTTP/1.1 Keep-Alive is default behaviour
  196. KeepAlive_ = true;
  197. }
  198. }
  199. }
  200. void THttpParser::ParseHeaderLine() {
  201. if (!!HeaderLine_) {
  202. if (CollectHeaders_) {
  203. THttpInputHeader hdr(HeaderLine_);
  204. Headers_.AddHeader(hdr);
  205. ApplyHeaderLine(hdr.Name(), hdr.Value());
  206. } else {
  207. //some dirty optimization (avoid reallocation new strings)
  208. size_t pos = HeaderLine_.find(':');
  209. if (pos == TString::npos) {
  210. ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
  211. }
  212. TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), HeaderLine_.begin() + pos)));
  213. TStringBuf val(StripString(TStringBuf(HeaderLine_.begin() + pos + 1, HeaderLine_.end())));
  214. ApplyHeaderLine(name, val);
  215. }
  216. HeaderLine_.remove(0);
  217. }
  218. }
  219. void THttpParser::OnEof() {
  220. if (Parser_ == &THttpParser::ContentParser && !HasContentLength_ && !ChunkInputState_) {
  221. return; //end of content determined by end of input
  222. }
  223. throw THttpException() << TStringBuf("incompleted http response");
  224. }
  225. bool THttpParser::DecodeContent() {
  226. if (!DecodeContent_) {
  227. return false;
  228. }
  229. if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
  230. DecodedContent_ = Content_;
  231. return false;
  232. }
  233. TMemoryInput in(Content_.data(), Content_.size());
  234. if (ContentEncoding_ == "gzip") {
  235. auto decompressor = TZLibDecompress(&in, ZLib::GZip);
  236. if (!GzipAllowMultipleStreams_) {
  237. decompressor.SetAllowMultipleStreams(false);
  238. }
  239. DecodedContent_ = decompressor.ReadAll();
  240. } else if (ContentEncoding_ == "deflate") {
  241. //https://tools.ietf.org/html/rfc1950
  242. bool definitelyNoZlibHeader;
  243. if (Content_.size() < 2) {
  244. definitelyNoZlibHeader = true;
  245. } else {
  246. const ui16 cmf = static_cast<ui8>(Content_[0]);
  247. const ui16 flg = static_cast<ui8>(Content_[1]);
  248. definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
  249. }
  250. try {
  251. DecodedContent_ = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
  252. }
  253. catch(...) {
  254. if (definitelyNoZlibHeader) {
  255. throw;
  256. }
  257. TMemoryInput retryInput(Content_.data(), Content_.size());
  258. DecodedContent_ = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
  259. }
  260. } else if (ContentEncoding_.StartsWith("z-")) {
  261. // opposite for library/cpp/http/io/stream.h
  262. const NBlockCodecs::ICodec* codec = nullptr;
  263. try {
  264. const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2);
  265. if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
  266. ythrow NBlockCodecs::TNotFound() << codecName;
  267. }
  268. codec = NBlockCodecs::Codec(codecName);
  269. } catch(const NBlockCodecs::TNotFound& exc) {
  270. throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
  271. }
  272. NBlockCodecs::TDecodedInput decoder(&in, codec);
  273. DecodedContent_ = decoder.ReadAll();
  274. } else if (ContentEncoding_ == "lz4") {
  275. const auto* codec = NBlockCodecs::Codec(TStringBuf(ContentEncoding_));
  276. DecodedContent_ = codec->Decode(Content_);
  277. } else if (ContentEncoding_ == "br") {
  278. TBrotliDecompress decoder(&in);
  279. DecodedContent_ = decoder.ReadAll();
  280. } else {
  281. throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
  282. }
  283. return true;
  284. }
  285. void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
  286. if (AsciiEqualsIgnoreCase(name, TStringBuf("connection"))) {
  287. KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
  288. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-length"))) {
  289. Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
  290. ContentLength_ = FromString<ui64>(val);
  291. HasContentLength_ = true;
  292. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("transfer-encoding"))) {
  293. if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
  294. ChunkInputState_ = new TChunkInputState();
  295. }
  296. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("accept-encoding"))) {
  297. TStringBuf encodings(val);
  298. while (encodings.size()) {
  299. TStringBuf enc = encodings.NextTok(',').After(' ').Before(' ');
  300. if (!enc) {
  301. continue;
  302. }
  303. TString s(enc);
  304. s.to_lower();
  305. AcceptEncodings_.insert(s);
  306. }
  307. } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-encoding"))) {
  308. TString s(val);
  309. s.to_lower();
  310. ContentEncoding_ = s;
  311. }
  312. }