123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356 |
- #include "http_parser.h"
- #include <library/cpp/blockcodecs/stream.h>
- #include <library/cpp/blockcodecs/codecs.h>
- #include <library/cpp/streams/brotli/brotli.h>
- #include <util/generic/string.h>
- #include <util/generic/yexception.h>
- #include <util/stream/mem.h>
- #include <util/stream/zlib.h>
- #include <util/string/ascii.h>
- #include <util/string/split.h>
- #include <util/string/strip.h>
// Debug trace hook used throughout this file; it currently expands to nothing.
// To turn tracing on, swap the empty definition below for the commented-out one:
//#define DBGOUT(args) Cout << args << Endl;
#define DBGOUT(args)
- namespace {
- const TString BestCodings[] = {
- "gzip",
- "deflate",
- "br",
- "x-gzip",
- "x-deflate",
- "y-lzo",
- "y-lzf",
- "y-lzq",
- "y-bzip2",
- "y-lzma",
- };
- }
- TString THttpParser::GetBestCompressionScheme() const {
- if (AcceptEncodings_.contains("*")) {
- return BestCodings[0];
- }
- for (auto& coding : BestCodings) {
- if (AcceptEncodings_.contains(coding)) {
- return coding;
- }
- }
- return TString();
- }
- bool THttpParser::FirstLineParser() {
- if (Y_UNLIKELY(!ReadLine())) {
- return false;
- }
- CurrentLine_.swap(FirstLine_);
- try {
- TStringBuf s(FirstLine_);
- if (MessageType_ == Response) {
- // Status-Line = HTTP-Version SP Status-Code SP Reason-Phrase CRLF
- TStringBuf httpVersion, statusCode;
- GetNext(s, ' ', httpVersion);
- ParseHttpVersion(httpVersion);
- GetNext(s, ' ', statusCode);
- RetCode_ = FromString<unsigned>(statusCode);
- } else {
- // Request-Line = Method SP Request-URI SP HTTP-Version CRLF
- TStringBuf httpVersion = s.After(' ').After(' ');
- ParseHttpVersion(httpVersion);
- }
- } catch (...) {
- throw THttpParseException() << "Cannot parse first line: " << CurrentExceptionMessage() << " First 80 chars of line: " << FirstLine_.substr(0, Min<size_t>(80ull, FirstLine_.size())).Quote();
- }
- return HeadersParser();
- }
- bool THttpParser::HeadersParser() {
- while (ReadLine()) {
- if (!CurrentLine_) {
- //end of headers
- DBGOUT("end of headers()");
- ParseHeaderLine();
- if (HasContentLength_) {
- if (ContentLength_ == 0) {
- return OnEndParsing();
- }
- if (ContentLength_ < 1000000) {
- Content_.reserve(ContentLength_ + 1);
- }
- }
- return !!ChunkInputState_ ? ChunkedContentParser() : ContentParser();
- }
- if (CurrentLine_[0] == ' ' || CurrentLine_[0] == '\t') {
- //continue previous header-line
- HeaderLine_ += CurrentLine_;
- CurrentLine_.remove(0);
- } else {
- ParseHeaderLine();
- HeaderLine_.swap(CurrentLine_);
- }
- }
- Parser_ = &THttpParser::HeadersParser;
- return false;
- }
- bool THttpParser::ContentParser() {
- DBGOUT("Content parsing()");
- if (HasContentLength_ && !BodyNotExpected_) {
- size_t rd = Min<size_t>(DataEnd_ - Data_, ContentLength_ - Content_.size());
- Content_.append(Data_, rd);
- Data_ += rd;
- DBGOUT("Content parsing: " << Content_.Size() << " from " << ContentLength_);
- if (Content_.size() == ContentLength_) {
- return OnEndParsing();
- }
- } else {
- if (MessageType_ == Request) {
- return OnEndParsing(); //RFC2616 4.4-5
- } else if (Y_UNLIKELY(BodyNotExpected_ || RetCode() < 200 || RetCode() == 204 || RetCode() == 304)) {
- return OnEndParsing(); //RFC2616 4.4-1
- }
- Content_.append(Data_, DataEnd_);
- Data_ = DataEnd_;
- }
- Parser_ = &THttpParser::ContentParser;
- return false;
- }
- bool THttpParser::ChunkedContentParser() {
- DBGOUT("ReadChunkedContent");
- TChunkInputState& ci = *ChunkInputState_;
- if (Content_.capacity() < static_cast<size_t>(DataEnd_ - Data_)) {
- //try reduce memory reallocations
- Content_.reserve(DataEnd_ - Data_);
- }
- do {
- if (!ci.LeftBytes_) {
- if (Y_UNLIKELY(!ReadLine())) { //read first chunk size or CRLF from prev chunk or CRLF from last chunk
- break;
- }
- if (Y_UNLIKELY(ci.ReadLastChunk_)) {
- return OnEndParsing();
- }
- if (!CurrentLine_) {
- // skip crlf from previous chunk
- if (!ReadLine()) {
- break;
- }
- }
- Y_ENSURE(CurrentLine_.size(), "NEH: LeftBytes hex number cannot be empty. ");
- size_t size = CurrentLine_.find_first_of(" \t;");
- if (size == TString::npos) {
- size = CurrentLine_.size();
- }
- ci.LeftBytes_ = IntFromString<ui32, 16, char>(CurrentLine_.c_str(), size);
- CurrentLine_.remove(0);
- if (!ci.LeftBytes_) { //detectect end of context marker - zero-size chunk, need read CRLF after empty chunk
- ci.ReadLastChunk_ = true;
- if (ReadLine()) {
- return OnEndParsing();
- } else {
- break;
- }
- }
- }
- size_t rd = Min<size_t>(DataEnd_ - Data_, ci.LeftBytes_);
- Content_.append(Data_, rd);
- Data_ += rd;
- ci.LeftBytes_ -= rd;
- } while (Data_ != DataEnd_);
- Parser_ = &THttpParser::ChunkedContentParser;
- return false;
- }
- bool THttpParser::OnEndParsing() {
- Parser_ = &THttpParser::OnEndParsing;
- ExtraDataSize_ = DataEnd_ - Data_;
- return true;
- }
- //continue read to CurrentLine_
- bool THttpParser::ReadLine() {
- TStringBuf in(Data_, DataEnd_);
- size_t endl = in.find('\n');
- if (Y_UNLIKELY(endl == TStringBuf::npos)) {
- //input line not completed
- CurrentLine_.append(Data_, DataEnd_);
- return false;
- }
- CurrentLine_.append(in.data(), endl);
- if (Y_LIKELY(CurrentLine_.size())) {
- //remove '\r' from tail
- size_t withoutCR = CurrentLine_.size() - 1;
- if (CurrentLine_[withoutCR] == '\r') {
- CurrentLine_.remove(withoutCR);
- }
- }
- //Cout << "ReadLine:" << CurrentLine_ << Endl;
- Data_ += endl + 1;
- return true;
- }
- void THttpParser::ParseHttpVersion(TStringBuf httpVersion) {
- if (!httpVersion.StartsWith("HTTP/", 5)) {
- throw yexception() << "expect 'HTTP/'";
- }
- httpVersion.Skip(5);
- {
- TStringBuf major, minor;
- Split(httpVersion, '.', major, minor);
- HttpVersion_.Major = FromString<unsigned>(major);
- HttpVersion_.Minor = FromString<unsigned>(minor);
- if (Y_LIKELY(HttpVersion_.Major > 1 || HttpVersion_.Minor > 0)) {
- // since HTTP/1.1 Keep-Alive is default behaviour
- KeepAlive_ = true;
- }
- }
- }
- void THttpParser::ParseHeaderLine() {
- if (!!HeaderLine_) {
- if (CollectHeaders_) {
- THttpInputHeader hdr(HeaderLine_);
- Headers_.AddHeader(hdr);
- ApplyHeaderLine(hdr.Name(), hdr.Value());
- } else {
- //some dirty optimization (avoid reallocation new strings)
- size_t pos = HeaderLine_.find(':');
- if (pos == TString::npos) {
- ythrow THttpParseException() << "can not parse http header(" << HeaderLine_.Quote() << ")";
- }
- TStringBuf name(StripString(TStringBuf(HeaderLine_.begin(), HeaderLine_.begin() + pos)));
- TStringBuf val(StripString(TStringBuf(HeaderLine_.begin() + pos + 1, HeaderLine_.end())));
- ApplyHeaderLine(name, val);
- }
- HeaderLine_.remove(0);
- }
- }
- void THttpParser::OnEof() {
- if (Parser_ == &THttpParser::ContentParser && !HasContentLength_ && !ChunkInputState_) {
- return; //end of content determined by end of input
- }
- throw THttpException() << TStringBuf("incompleted http response");
- }
- bool THttpParser::DecodeContent() {
- if (!DecodeContent_) {
- return false;
- }
- if (!ContentEncoding_ || ContentEncoding_ == "identity" || ContentEncoding_ == "none") {
- DecodedContent_ = Content_;
- return false;
- }
- TMemoryInput in(Content_.data(), Content_.size());
- if (ContentEncoding_ == "gzip") {
- auto decompressor = TZLibDecompress(&in, ZLib::GZip);
- if (!GzipAllowMultipleStreams_) {
- decompressor.SetAllowMultipleStreams(false);
- }
- DecodedContent_ = decompressor.ReadAll();
- } else if (ContentEncoding_ == "deflate") {
- //https://tools.ietf.org/html/rfc1950
- bool definitelyNoZlibHeader;
- if (Content_.size() < 2) {
- definitelyNoZlibHeader = true;
- } else {
- const ui16 cmf = static_cast<ui8>(Content_[0]);
- const ui16 flg = static_cast<ui8>(Content_[1]);
- definitelyNoZlibHeader = ((cmf << 8) | flg) % 31 != 0;
- }
- try {
- DecodedContent_ = TZLibDecompress(&in, definitelyNoZlibHeader ? ZLib::Raw : ZLib::ZLib).ReadAll();
- }
- catch(...) {
- if (definitelyNoZlibHeader) {
- throw;
- }
- TMemoryInput retryInput(Content_.data(), Content_.size());
- DecodedContent_ = TZLibDecompress(&retryInput, ZLib::Raw).ReadAll();
- }
- } else if (ContentEncoding_.StartsWith("z-")) {
- // opposite for library/cpp/http/io/stream.h
- const NBlockCodecs::ICodec* codec = nullptr;
- try {
- const TStringBuf codecName = TStringBuf(ContentEncoding_).SubStr(2);
- if (codecName.StartsWith("zstd06") || codecName.StartsWith("zstd08")) {
- ythrow NBlockCodecs::TNotFound() << codecName;
- }
- codec = NBlockCodecs::Codec(codecName);
- } catch(const NBlockCodecs::TNotFound& exc) {
- throw THttpParseException() << "Unsupported content-encoding method: " << exc.AsStrBuf();
- }
- NBlockCodecs::TDecodedInput decoder(&in, codec);
- DecodedContent_ = decoder.ReadAll();
- } else if (ContentEncoding_ == "lz4") {
- const auto* codec = NBlockCodecs::Codec(TStringBuf(ContentEncoding_));
- DecodedContent_ = codec->Decode(Content_);
- } else if (ContentEncoding_ == "br") {
- TBrotliDecompress decoder(&in);
- DecodedContent_ = decoder.ReadAll();
- } else {
- throw THttpParseException() << "Unsupported content-encoding method: " << ContentEncoding_;
- }
- return true;
- }
- void THttpParser::ApplyHeaderLine(const TStringBuf& name, const TStringBuf& val) {
- if (AsciiEqualsIgnoreCase(name, TStringBuf("connection"))) {
- KeepAlive_ = AsciiEqualsIgnoreCase(val, TStringBuf("keep-alive"));
- } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-length"))) {
- Y_ENSURE(val.size(), "NEH: Content-Length cannot be empty string. ");
- ContentLength_ = FromString<ui64>(val);
- HasContentLength_ = true;
- } else if (AsciiEqualsIgnoreCase(name, TStringBuf("transfer-encoding"))) {
- if (AsciiEqualsIgnoreCase(val, TStringBuf("chunked"))) {
- ChunkInputState_ = new TChunkInputState();
- }
- } else if (AsciiEqualsIgnoreCase(name, TStringBuf("accept-encoding"))) {
- TStringBuf encodings(val);
- while (encodings.size()) {
- TStringBuf enc = encodings.NextTok(',').After(' ').Before(' ');
- if (!enc) {
- continue;
- }
- TString s(enc);
- s.to_lower();
- AcceptEncodings_.insert(s);
- }
- } else if (AsciiEqualsIgnoreCase(name, TStringBuf("content-encoding"))) {
- TString s(val);
- s.to_lower();
- ContentEncoding_ = s;
- }
- }
|