#include "http_parser.h"

// NOTE(review): the targets of the ten includes below were missing from the
// reviewed text; the list is reconstructed from the names this file uses
// (NBlockCodecs, TBrotliDecompress, TString, yexception/Y_ENSURE, TMemoryInput,
// TZLibDecompress, AsciiEqualsIgnoreCase, Split/GetNext, StripString).
// Confirm against the build before merging.
#include <library/cpp/blockcodecs/stream.h>
#include <library/cpp/blockcodecs/codecs.h>
#include <library/cpp/streams/brotli/brotli.h>

#include <util/generic/string.h>
#include <util/generic/yexception.h>
#include <util/stream/mem.h>
#include <util/stream/zlib.h>
#include <util/string/ascii.h>
#include <util/string/split.h>
#include <util/string/strip.h>

// Debug tracing hook: swap the two definitions to get trace output.
//#define DBGOUT(args) Cout << args << Endl;
#define DBGOUT(args)

namespace {
    // Content codings in order of preference; the first entry is also the
    // answer when the peer accepts "*".
    const TString BestCodings[] = {
        "gzip",
        "deflate",
        "br",
        "x-gzip",
        "x-deflate",
        "y-lzo",
        "y-lzf",
        "y-lzq",
        "y-bzip2",
        "y-lzma",
    };
}

// Returns the most preferred coding from BestCodings that is present in
// AcceptEncodings_, BestCodings[0] when "*" was accepted, or an empty string
// when nothing matches.
TString THttpParser::GetBestCompressionScheme() const {
    if (AcceptEncodings_.contains("*")) {
        return BestCodings[0];
    }

    for (const auto& coding : BestCodings) {
        if (AcceptEncodings_.contains(coding)) {
            return coding;
        }
    }

    return TString();
}

// Parses the start line (status line for a Response, request line otherwise)
// and then continues with the headers.
// Returns false when the line is not complete yet; otherwise returns whatever
// HeadersParser() returns.
// Throws THttpParseException when the line cannot be parsed.
bool THttpParser::FirstLineParser() {
    if (Y_UNLIKELY(!ReadLine())) {
        return false;
    }

    // Move the accumulated line into FirstLine_; CurrentLine_ takes over the
    // previous content of FirstLine_.
    CurrentLine_.swap(FirstLine_);

    try {
        TStringBuf s(FirstLine_);
        if (MessageType_ == Response) {
            // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
            TStringBuf httpVersion, statusCode;
            GetNext(s, ' ', httpVersion);
            ParseHttpVersion(httpVersion);
            GetNext(s, ' ', statusCode);
            RetCode_ = FromString(statusCode);
        } else {
            // Request-Line = Method SP Request-URI SP HTTP-Version CRLF
            TStringBuf httpVersion = s.After(' ').After(' ');
            ParseHttpVersion(httpVersion);
        }
    } catch (...) {
        // Re-throw as a parser exception, quoting at most 80 chars of the line.
        throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage()
                                    << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
    }

    return HeadersParser();
}

// Consumes header lines until the empty line that ends the header section,
// then hands over to the body parser.
// Returns false (and registers itself as the continuation in Parser_) when the
// input ends before the header section is complete.
bool THttpParser::HeadersParser() {
    while (ReadLine()) {
        if (!CurrentLine_) {
            //end of headers
            DBGOUT("end of headers()");
            ParseHeaderLine(); // flush the last accumulated header

            if (HasContentLength_) {
                if (ContentLength_ == 0) {
                    return OnEndParsing();
                }

                // Pre-allocate only for bodies below the 1000000-byte threshold.
                if (ContentLength_ < 1000000) {
                    Content_.reserve(ContentLength_ + 1);
                }
            }

            return !!ChunkInputState_ ? ChunkedContentParser() : ContentParser();
        }

        if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
            //continue previous header-line
            HeaderLine_ += CurrentLine_;
            CurrentLine_.remove(0);
        } else {
            // A new header starts: process the previous one, then keep this
            // line as the pending header.
            ParseHeaderLine();
            HeaderLine_.swap(CurrentLine_);
        }
    }

    Parser_ = &THttpParser::HeadersParser;
    return false;
}

// Reads a non-chunked body.
// With a known Content-Length it appends up to the remaining number of bytes;
// without one, requests and body-less responses end immediately and any other
// response consumes all available input.
// Returns true when the message is complete, false when more input is needed.
bool THttpParser::ContentParser() {
    DBGOUT("Content parsing()");
    if (HasContentLength_ && !BodyNotExpected_) {
        size_t rd = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
        Content_.append(Data_, rd);
        Data_ += rd;
        DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
        if (Content_.size() == ContentLength_) {
            return OnEndParsing();
        }
    } else {
        if (MessageType_ == Request) {
            return OnEndParsing(); //RFC2616 4.4-5
        } else if (Y_UNLIKELY(BodyNotExpected_ || RetCode() < 200 || RetCode() == 204 || RetCode() == 304)) {
            return OnEndParsing(); //RFC2616 4.4-1
        }

        Content_.append(Data_, DataEnd_);
        Data_ = DataEnd_;
    }

    Parser_ = &THttpParser::ContentParser;
    return false;
}

// Reads a body sent with "Transfer-Encoding: chunked".
// Progress is kept in *ChunkInputState_ (LeftBytes_, ReadLastChunk_) so the
// function can be re-entered with the next portion of input.
// Returns true when the terminating zero-size chunk and its CRLF were read,
// false when more input is needed.
bool THttpParser::ChunkedContentParser() {
    DBGOUT("ReadChunkedContent");
    TChunkInputState& ci = *ChunkInputState_;

    if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
        //try reduce memory reallocations
        Content_.reserve(DataEnd_ - Data_);
    }

    do {
        if (!ci.LeftBytes_) {
            if (Y_UNLIKELY(!ReadLine())) { //read first chunk size or CRLF from prev chunk or CRLF from last chunk
                break;
            }

            if (Y_UNLIKELY(ci.ReadLastChunk_)) {
                return OnEndParsing();
            }

            if (!CurrentLine_) {
                // skip crlf from previous chunk
                if (!ReadLine()) {
                    break;
                }
            }

            Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");

            // The chunk size ends at the first space, tab or ';'.
            size_t size = CurrentLine_.find_first_of(" \t;");
            if (size == TString::npos) {
                size = CurrentLine_.size();
            }
            ci.LeftBytes_ = IntFromString<size_t, 16, char>(CurrentLine_.c_str(), size);
            CurrentLine_.remove(0);

            if (!ci.LeftBytes_) {
                //detected end-of-content marker - zero-size chunk, need read CRLF after empty chunk
                ci.ReadLastChunk_ = true;
                if (ReadLine()) {
                    return OnEndParsing();
                } else {
                    break;
                }
            }
        }

        size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
        Content_.append(Data_, rd);
        Data_ += rd;
        ci.LeftBytes_ -= rd;
    } while (Data_ != DataEnd_);

    Parser_ = &THttpParser::ChunkedContentParser;
    return false;
}

// Terminal state: records how many input bytes are left unconsumed in
// ExtraDataSize_ and always returns true.
bool THttpParser::OnEndParsing() {
    Parser_ = &THttpParser::OnEndParsing;
    ExtraDataSize_ = DataEnd_ - Data_;
    return true;
}

//continue read to CurrentLine_
// Appends input up to the next '\n' to CurrentLine_ and strips one trailing '\r'.
// Returns false when no '\n' is available; in that case all remaining input is
// appended to CurrentLine_ and Data_ is left unchanged by this function.
bool THttpParser::ReadLine() {
    TStringBuf in(Data_, DataEnd_);
    size_t endl = in.find('\n');

    if (Y_UNLIKELY(endl == TStringBuf::npos)) {
        //input line not completed
        CurrentLine_.append(Data_, DataEnd_);
        return false;
    }

    CurrentLine_.append(in.data(), endl);
    if (Y_LIKELY(CurrentLine_.size())) {
        //remove '\r' from tail
        size_t withoutCR = CurrentLine_.size() - 1;
        if (CurrentLine_[withoutCR] == '\r') {
            CurrentLine_.remove(withoutCR);
        }
    }

    //Cout << "ReadLine:" << CurrentLine_ << Endl;
    Data_ += endl + 1;
    return true;
}

// Parses "HTTP/<major>.<minor>" into HttpVersion_ and turns KeepAlive_ on for
// versions newer than HTTP/1.0.
// Throws yexception when the "HTTP/" prefix is missing.
void THttpParser::ParseHttpVersion(TStringBuf httpVersion) {
    if (!httpVersion.StartsWith("HTTP/", 5)) {
        throw yexception() << "expect 'HTTP/'";
    }
    httpVersion.Skip(5);
    {
        TStringBuf major, minor;
        Split(httpVersion, '.', major, minor);
        HttpVersion_.Major = FromString(major);
        HttpVersion_.Minor = FromString(minor);
        // The minor number only matters within major version 1; without the
        // Major == 1 check, HTTP/0.9 would be treated as keep-alive by default.
        if (Y_LIKELY(HttpVersion_.Major > 1 || (HttpVersion_.Major == 1 && HttpVersion_.Minor > 0))) {
            // since HTTP/1.1 Keep-Alive is default behaviour
            KeepAlive_ = true;
        }
    }
}

// Processes the pending header accumulated in HeaderLine_ (no-op when empty)
// and clears it afterwards.
// Throws THttpParseException when the header has no ':' (non-collecting mode).
void THttpParser::ParseHeaderLine() {
    if (!!HeaderLine_) {
        if (CollectHeaders_) {
            THttpInputHeader hdr(HeaderLine_);

            Headers_.AddHeader(hdr);

            ApplyHeaderLine(hdr.Name(), hdr.Value());
        } else {
            //some dirty optimization (avoid reallocation new strings)
            size_t pos = HeaderLine_.find(':');

            if (pos == TString::npos) {
                ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
            }

            // name/val are views into HeaderLine_, so they must be consumed
            // before HeaderLine_ is cleared below.
            TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), HeaderLine_.begin() + pos)));
            TStringBuf val(StripString(TStringBuf(HeaderLine_.begin() + pos + 1, HeaderLine_.end())));
            ApplyHeaderLine(name, val);
        }
        HeaderLine_.remove(0);
    }
}

// Called on end of input. Returns normally only when the parser is reading a
// body with neither Content-Length nor chunked encoding; otherwise throws
// THttpException.
void THttpParser::OnEof() {
    if (Parser_ == &THttpParser::ContentParser && !HasContentLength_ && !ChunkInputState_) {
        return; //end of content determined by end of input
    }
    throw THttpException() << TStringBuf("incompleted http response");
}

// Decodes Content_ into DecodedContent_ according to ContentEncoding_.
// Returns false when decoding is disabled (DecodedContent_ untouched) or when
// the encoding is empty/"identity"/"none" (DecodedContent_ becomes a copy of
// Content_); returns true after an actual decode.
// Throws THttpParseException for an unsupported encoding.
bool THttpParser::DecodeContent() {
    if (!DecodeContent_) {
        return false;
    }

    if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
        DecodedContent_ = Content_;
        return false;
    }

    TMemoryInput in(Content_.data(), Content_.size());
    if (ContentEncoding_ == "gzip") {
        auto decompressor = TZLibDecompress(&in, ZLib::GZip);
        if (!GzipAllowMultipleStreams_) {
            decompressor.SetAllowMultipleStreams(false);
        }
        DecodedContent_ = decompressor.ReadAll();
    } else if (ContentEncoding_ == "deflate") {
        //https://tools.ietf.org/html/rfc1950
        // A zlib stream starts with CMF/FLG bytes whose 16-bit big-endian value
        // is a multiple of 31; anything else cannot carry a zlib header.
        bool definitelyNoZlibHeader;
        if (Content_.size() < 2) {
            definitelyNoZlibHeader = true;
        } else {
            // Go through ui8 first so the bytes are not sign-extended.
            const ui16 cmf = static_cast<ui8>(Content_[0]);
            const ui16 flg = static_cast<ui8>(Content_[1]);
            definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
        }

        try {
            DecodedContent_ = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
        } catch (...) {
            if (definitelyNoZlibHeader) {
                throw;
            }
            // The header check passed but zlib decoding failed: retry the
            // whole body as a raw deflate stream.
            TMemoryInput retryInput(Content_.data(), Content_.size());
            DecodedContent_ = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
        }
    } else if (ContentEncoding_.StartsWith("z-")) {
        // opposite for library/cpp/http/io/stream.h
        const NBlockCodecs::ICodec* codec = nullptr;
        try {
            const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2);
            // zstd06*/zstd08* codec names are rejected explicitly.
            if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
                ythrow NBlockCodecs::TNotFound() << codecName;
            }
            codec = NBlockCodecs::Codec(codecName);
        } catch (const NBlockCodecs::TNotFound& exc) {
            throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
        }
        NBlockCodecs::TDecodedInput decoder(&in, codec);
        DecodedContent_ = decoder.ReadAll();
    } else if (ContentEncoding_ == "lz4") {
        const auto* codec = NBlockCodecs::Codec(TStringBuf(ContentEncoding_));
        DecodedContent_ = codec->Decode(Content_);
    } else if (ContentEncoding_ == "br") {
        TBrotliDecompress decoder(&in);
        DecodedContent_ = decoder.ReadAll();
    } else {
        throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
    }
    return true;
}

// Updates parser state from a single header; header names are matched
// case-insensitively and unknown headers are ignored.
//   connection        -> KeepAlive_ (true only for the value "keep-alive")
//   content-length    -> ContentLength_, HasContentLength_
//   transfer-encoding -> allocates ChunkInputState_ for the value "chunked"
//   accept-encoding   -> lower-cased entries added to AcceptEncodings_
//   content-encoding  -> lower-cased value stored in ContentEncoding_
void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
    if (AsciiEqualsIgnoreCase(name, TStringBuf("connection"))) {
        KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-length"))) {
        Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
        ContentLength_ = FromString(val);
        HasContentLength_ = true;
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("transfer-encoding"))) {
        if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
            ChunkInputState_ = new TChunkInputState();
        }
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("accept-encoding"))) {
        TStringBuf encodings(val);
        while (encodings.size()) {
            // One comma-separated item per iteration, trimmed with After(' ') / Before(' ').
            TStringBuf enc = encodings.NextTok(',').After(' ').Before(' ');
            if (!enc) {
                continue;
            }
            TString s(enc);
            s.to_lower();
            AcceptEncodings_.insert(s);
        }
    } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-encoding"))) {
        TString s(val);
        s.to_lower();
        ContentEncoding_ = s;
    }
}