|
- #include <util/generic/hash.h>
- #include <util/string/cast.h>
- #include <util/generic/hash_set.h>
- #include <util/generic/yexception.h>
- #include "parser.h"
- //#define DEBUG_ME 1
- // Empty text positioned at offset 0.
- TCppSaxParser::TText::TText()
-     : TText(0)
- {
- }
- // Empty text positioned at the given offset.
- TCppSaxParser::TText::TText(ui64 offset)
-     : Offset(offset)
- {
- }
- // Text holding a copy of `data`, positioned at the given offset.
- TCppSaxParser::TText::TText(const TString& data, ui64 offset)
-     : Data(data)
-     , Offset(offset)
- {
- }
- // Compiler-generated destructor, defined out of line.
- TCppSaxParser::TText::~TText() = default;
- // Drops the buffered characters and moves Offset forward by the number of
- // characters dropped, so Offset keeps pointing at where the next text starts.
- void TCppSaxParser::TText::Reset() noexcept {
-     const auto consumed = Data.length();
-     Data.clear();
-     Offset += consumed;
- }
- TCppSaxParser::TWorker::TWorker() noexcept = default; // compiler-generated default constructor, defined out of line
- TCppSaxParser::TWorker::~TWorker() = default; // compiler-generated destructor, defined out of line
- // Incremental lexical splitter that backs TCppSaxParser.
- //
- // Bytes handed to Write() are cut into fragments - plain code, preprocessor
- // directives, string and character literals, one-line and multi-line
- // comments - and every finished fragment is reported to the worker through
- // the matching Do*() callback. Text_ accumulates the fragment in progress;
- // Action() hands it to the worker and then calls Text_.Reset().
- //
- // All scanner state, including the context to return to after a comment or a
- // literal (SavedState_), lives in members, so the input may be split between
- // Write() calls at any byte.
- class TCppSaxParser::TImpl {
-     enum EState {
-         Code,
-         CommentBegin,        // a '/' was seen; the next byte decides what it is
-         String,
-         Character,
-         OneLineComment,
-         MultiLineComment,
-         MultiLineCommentEnd, // a '*' was seen inside a multi-line comment
-         Preprocessor
-     };
- public:
-     typedef TCppSaxParser::TText TText;
-     typedef TCppSaxParser::TWorker TWorker;
-     // Stores the worker and immediately notifies it with DoStart().
-     // The worker pointer is dereferenced here and must not be null.
-     inline TImpl(TWorker* worker)
-         : State_(Code)
-         , SavedState_(Code)
-         , Worker_(worker)
-         , SkipNext_(false)
-         , Line_(0)
-         , Column_(0)
-     {
-         Worker_->DoStart();
-     }
-     inline ~TImpl() = default;
-     // Feeds the next `len` bytes of source text to the scanner.
-     inline void Write(const void* data, size_t len) {
-         ProcessInput(static_cast<const char*>(data), len);
-     }
-     // Ends the input: reports the still-buffered fragment, then calls DoEnd().
-     // Throws (see ThrowError) when buffered text belongs to a construct that
-     // was never closed, e.g. a string literal or a multi-line comment.
-     inline void Finish() {
-         if (State_ == CommentBegin) {
-             // The input ended right after a lone '/'. ProcessInput was holding
-             // it back to see whether a comment starts; since none does, the
-             // slash is ordinary text of the enclosing code/preprocessor
-             // fragment and must not be lost.
-             Text_.Data += '/';
-             State_ = SavedState_;
-         }
-         if (!Text_.Data.empty()) {
-             switch (State_) {
-                 case Code:
-                     Worker_->DoCode(Text_);
-                     break;
-                 case Preprocessor:
-                     Worker_->DoPreprocessor(Text_);
-                     break;
-                 case OneLineComment:
-                     Worker_->DoOneLineComment(Text_);
-                     break;
-                 default:
-                     ThrowError();
-             }
-         }
-         Worker_->DoEnd();
-     }
- private:
-     // Runs the state machine over one chunk of input, one byte at a time.
-     inline void ProcessInput(const char* data, size_t len) {
-         while (len) {
-             const char ch = *data;
-             // Position bookkeeping, used only for error messages.
-             if (ch == '\n') {
-                 ++Line_;
-                 Column_ = 0;
-             } else {
-                 ++Column_;
-             }
- #if DEBUG_ME
-             Cerr << "char: " << ch << Endl;
-             Cerr << "state before: " << (unsigned int)State_ << Endl;
- #endif
-         // Re-dispatches the same byte after a state change that did not consume it.
-         retry:
-             switch (State_) {
-                 case Code: {
-                     SavedState_ = Code;
-                     switch (ch) {
-                         case '/':
-                             // Held back: may start a comment or be a plain slash.
-                             State_ = CommentBegin;
-                             break;
-                         case '"':
-                             // Action(ch) reports the code collected so far and
-                             // starts the new fragment with the opening quote.
-                             Action(ch);
-                             State_ = String;
-                             break;
-                         case '\'':
-                             Action(ch);
-                             State_ = Character;
-                             break;
-                         case '#':
-                             Action(ch);
-                             State_ = Preprocessor;
-                             break;
-                         default:
-                             Text_.Data += ch;
-                             break;
-                     }
-                     break;
-                 }
-                 case CommentBegin: {
-                     switch (ch) {
-                         case '/':
-                             // Report the enclosing fragment under its own state,
-                             // then start the comment text with "//".
-                             State_ = SavedState_;
-                             SavedState_ = Code;
-                             Action("//");
-                             State_ = OneLineComment;
-                             break;
-                         case '*':
-                             State_ = SavedState_;
-                             Action("/*");
-                             State_ = MultiLineComment;
-                             break;
-                         default:
-                             // Not a comment: emit the held-back slash and let the
-                             // enclosing state handle the current byte.
-                             Text_.Data += '/';
-                             State_ = SavedState_;
-                             goto retry;
-                     }
-                     break;
-                 }
-                 case OneLineComment: {
-                     switch (ch) {
-                         case '\n':
-                             // The newline is not part of the comment: it becomes
-                             // the first byte of the following code fragment.
-                             Action(ch);
-                             State_ = Code;
-                             break;
-                         default:
-                             Text_.Data += ch;
-                             break;
-                     }
-                     break;
-                 }
-                 case MultiLineComment: {
-                     switch (ch) {
-                         case '*':
-                             Text_.Data += ch;
-                             State_ = MultiLineCommentEnd;
-                             break;
-                         case '\n':
-                             Text_.Data += ch;
-                             // A comment that spans a newline returns to Code
-                             // afterwards, even if it began on a preprocessor line.
-                             SavedState_ = Code;
-                             break;
-                         default:
-                             Text_.Data += ch;
-                             break;
-                     }
-                     break;
-                 }
-                 case MultiLineCommentEnd: {
-                     switch (ch) {
-                         case '/':
-                             Text_.Data += ch;
-                             Action();
-                             State_ = SavedState_;
-                             break;
-                         default:
-                             // The '*' did not close the comment; treat the byte
-                             // as regular comment content (it may be another '*').
-                             State_ = MultiLineComment;
-                             goto retry;
-                     }
-                     break;
-                 }
-                 case String: {
-                     ProcessQuoted(ch, '"');
-                     break;
-                 }
-                 case Character: {
-                     ProcessQuoted(ch, '\'');
-                     break;
-                 }
-                 case Preprocessor: {
-                     SavedState_ = Preprocessor;
-                     switch (ch) {
-                         case '/':
-                             State_ = CommentBegin;
-                             break;
-                         case '\'':
-                             // Literals inside a directive stay part of its text.
-                             Text_.Data += ch;
-                             State_ = Character;
-                             break;
-                         case '"':
-                             Text_.Data += ch;
-                             State_ = String;
-                             break;
-                         case '\n':
-                             Text_.Data += ch;
-                             if (SkipNext_) {
-                                 // Backslash right before the newline: the
-                                 // directive continues on the next line.
-                                 SkipNext_ = false;
-                             } else {
-                                 Action();
-                                 SavedState_ = Code;
-                                 State_ = Code;
-                             }
-                             break;
-                         case '\\':
-                             Text_.Data += ch;
-                             SkipNext_ = true;
-                             break;
-                         default:
-                             Text_.Data += ch;
-                             SkipNext_ = false;
-                             break;
-                     }
-                     break;
-                 }
-                 default:
-                     ThrowError();
-             }
- #if DEBUG_ME
-             Cerr << "state after: " << (unsigned int)State_ << Endl;
- #endif
-             ++data;
-             --len;
-         }
-     }
-     // Handles one byte inside a string (quote == '"') or character
-     // (quote == '\'') literal. SkipNext_ is set while the previous byte was an
-     // unescaped backslash, so an escaped quote does not close the literal.
-     inline void ProcessQuoted(char ch, char quote) {
-         Text_.Data += ch;
-         if (ch == quote) {
-             if (SkipNext_) {
-                 SkipNext_ = false;
-             } else {
-                 // A literal met in plain code is reported on its own; inside a
-                 // preprocessor directive it stays part of the directive text.
-                 if (SavedState_ == Code) {
-                     Action();
-                 }
-                 State_ = SavedState_;
-             }
-         } else if (ch == '\\') {
-             SkipNext_ = !SkipNext_;
-         } else {
-             SkipNext_ = false;
-         }
-     }
-     // Reports the current fragment, then starts the next one with `ch`.
-     inline void Action(char ch) {
-         Action();
-         Text_.Data += ch;
-     }
-     // Reports the current fragment, then starts the next one with `st`.
-     inline void Action(const char* st) {
-         Action();
-         Text_.Data += st;
-     }
-     // Hands Text_ to the worker callback selected by the current state and
-     // resets the buffer. Throws for states that have no callback.
-     inline void Action() {
-         switch (State_) {
-             case Code:
-                 Worker_->DoCode(Text_);
-                 break;
-             case OneLineComment:
-                 Worker_->DoOneLineComment(Text_);
-                 break;
-             case MultiLineCommentEnd:
-                 Worker_->DoMultiLineComment(Text_);
-                 break;
-             case Preprocessor:
-                 Worker_->DoPreprocessor(Text_);
-                 break;
-             case String:
-                 Worker_->DoString(Text_);
-                 break;
-             case Character:
-                 Worker_->DoCharacter(Text_);
-                 break;
-             default:
-                 ThrowError();
-         }
-         Text_.Reset();
-     }
-     // Throws yexception carrying the 1-based line and column of the last byte read.
-     inline void ThrowError() const {
-         ythrow yexception() << "can not parse source(line = " << static_cast<unsigned>(Line_) + 1 << ", column = " << static_cast<unsigned>(Column_) + 1 << ")";
-     }
- private:
-     EState State_;
-     // Context (Code or Preprocessor) to resume once the current comment or
-     // literal ends. Kept as a member so it survives between Write() calls.
-     EState SavedState_;
-     TWorker* Worker_;
-     TText Text_;
-     bool SkipNext_;
-     ui64 Line_;
-     ui64 Column_;
- };
- TCppSaxParser::TCppSaxParser(TWorker* worker) // builds the parser around the given worker
- : Impl_(new TImpl(worker)) // TImpl's constructor calls worker->DoStart() right away
- {
- }
- TCppSaxParser::~TCppSaxParser() = default; // compiler-generated destructor, defined out of line
- void TCppSaxParser::DoWrite(const void* data, size_t len) { // forwards the next len bytes of source text to the implementation
- Impl_->Write(data, len);
- }
- void TCppSaxParser::DoFinish() { // end of input: TImpl::Finish() reports any pending fragment and then calls the worker's DoEnd()
- Impl_->Finish();
- }
- // Nothing to set up; defined out of line like the other special members in this file.
- TCppSimpleSax::TCppSimpleSax() noexcept = default;
- // Compiler-generated destructor, defined out of line.
- TCppSimpleSax::~TCppSimpleSax() = default;
- // Splits a code fragment into maximal runs of bytes of the same class and
- // reports every run through DoIdentifier(), DoWhiteSpace() or DoSyntax().
- // Classes: whitespace is tab, LF, CR and space; identifier bytes are ASCII
- // letters, digits and '_'; everything else is syntax. Each reported TText
- // carries the run itself and its offset, counted from text.Offset.
- void TCppSimpleSax::DoCode(const TText& text) {
-     static const char CWHITESPACE = 0;
-     static const char CIDENTIFIER = 1;
-     static const char CSYNTAX = 2;
-     // Class of every possible byte value, indexed by the byte.
-     static const char char_types[] = {
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
-         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1,
-         2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
-     enum EState {
-         WhiteSpace = CWHITESPACE,
-         Identifier = CIDENTIFIER,
-         Syntax = CSYNTAX
-     };
-     // The scan starts as if inside an (empty) identifier run.
-     EState state = Identifier;
-     TText run(text.Offset);
-     // Reports the accumulated run to the handler of the given class.
-     const auto emit = [this, &run](EState kind) {
-         switch (kind) {
-             case Identifier:
-                 DoIdentifier(run);
-                 break;
-             case WhiteSpace:
-                 DoWhiteSpace(run);
-                 break;
-             case Syntax:
-                 DoSyntax(run);
-                 break;
-         }
-     };
-     for (const char raw : text.Data) {
-         const unsigned char ch = static_cast<unsigned char>(raw);
-         const EState kind = static_cast<EState>(char_types[ch]);
-         if (kind != state) {
-             // The class changed: close the current run. It can be empty only
-             // at the very beginning, while still in the initial state.
-             if (!run.Data.empty()) {
-                 emit(state);
-             }
-             run.Reset();
-             state = kind;
-         }
-         run.Data += ch;
-     }
-     // Report whatever run was still open at the end of the fragment.
-     if (!run.Data.empty()) {
-         emit(state);
-     }
- }
- // Token classification helpers for TCppFullSax: a keyword set plus four
- // number predicates.
- class TCppFullSax::TImpl {
-     typedef THashSet<TString> TKeyWords;
-     // Placeholder matcher: the pattern passed to the constructor is discarded
-     // and Match() always returns false, so every number predicate below
-     // currently answers "no".
-     class TRegExp {
-     public:
-         inline TRegExp(const char*) {
-         }
-         inline bool Match(const TString& /*s*/) const noexcept {
-             return false;
-         }
-     };
- public:
-     // Sets up the number patterns and registers the built-in keyword list.
-     inline TImpl()
-         : OctNumber_("^[+-]?0[0-7]+$")
-         , HexNumber_("^[+-]?0x[0-9A-Fa-f]+$")
-         , DecNumber_("^[+-]?[0-9]+$")
-         , FltNumber_("^[+-]?[0-9]*\\.[0-9]*$")
-     {
-         static const char* const builtinKeywords[] = {
-             "extern", "static", "inline", "volatile", "asm", "const", "mutable",
-             "char", "signed", "unsigned", "int", "short", "long", "double", "float", "bool",
-             "class", "struct", "union", "void", "auto",
-             "throw", "try", "catch",
-             "for", "do", "if", "else", "while", "switch", "case", "default",
-             "goto", "break", "continue",
-             "virtual", "template", "typename", "enum",
-             "public", "private", "protected",
-             "using", "namespace", "typedef",
-             "true", "false", "return", "new", "delete", "operator", "friend", "this"};
-         for (const char* word : builtinKeywords) {
-             AddKeyword(word);
-         }
-     }
-     inline ~TImpl() = default;
-     // Adds one more word to the keyword set (duplicates are harmless).
-     inline void AddKeyword(const TString& keyword) {
-         KeyWords_.insert(keyword);
-     }
-     // True when `s` is in the keyword set.
-     inline bool IsKeyword(const TString& s) {
-         const auto pos = KeyWords_.find(s);
-         return pos != KeyWords_.end();
-     }
-     // The four predicates below delegate to the corresponding matcher.
-     inline bool IsOctNumber(const TString& s) {
-         return OctNumber_.Match(s);
-     }
-     inline bool IsHexNumber(const TString& s) {
-         return HexNumber_.Match(s);
-     }
-     inline bool IsDecNumber(const TString& s) {
-         return DecNumber_.Match(s);
-     }
-     inline bool IsFloatNumber(const TString& s) {
-         return FltNumber_.Match(s);
-     }
- private:
-     const TRegExp OctNumber_;
-     const TRegExp HexNumber_;
-     const TRegExp DecNumber_;
-     const TRegExp FltNumber_;
-     TKeyWords KeyWords_;
- };
- TCppFullSax::TCppFullSax() // creates the implementation object; TImpl's constructor registers the built-in keyword list
- : Impl_(new TImpl())
- {
- }
- TCppFullSax::~TCppFullSax() = default; // compiler-generated destructor, defined out of line
- void TCppFullSax::AddKeyword(const TString& keyword) { // registers an extra word that DoIdentifier() will report through DoKeyword()
- Impl_->AddKeyword(keyword);
- }
- // Classifies an identifier-like token and forwards it to exactly one of the
- // finer-grained handlers. The checks run in a fixed order - keyword, octal,
- // hexadecimal, decimal, floating point - and the first match wins; a token
- // that matches none of them is reported through DoName().
- void TCppFullSax::DoIdentifier(const TText& text) {
-     const TString& token = text.Data;
-     if (Impl_->IsKeyword(token)) {
-         DoKeyword(text);
-         return;
-     }
-     if (Impl_->IsOctNumber(token)) {
-         DoOctNumber(text);
-         return;
-     }
-     if (Impl_->IsHexNumber(token)) {
-         DoHexNumber(text);
-         return;
-     }
-     if (Impl_->IsDecNumber(token)) {
-         DoDecNumber(text);
-         return;
-     }
-     if (Impl_->IsFloatNumber(token)) {
-         DoFloatNumber(text);
-         return;
-     }
-     DoName(text);
- }
- void TCppFullSax::DoEnd() { // empty default implementation: the end-of-input event is ignored
- }
- void TCppFullSax::DoStart() { // empty default implementation: the start event is ignored
- }
- void TCppFullSax::DoString(const TText&) { // empty default implementation: string literal text is ignored
- }
- void TCppFullSax::DoCharacter(const TText&) { // empty default implementation: character literal text is ignored
- }
- void TCppFullSax::DoWhiteSpace(const TText&) { // empty default implementation: whitespace runs are ignored
- }
- void TCppFullSax::DoKeyword(const TText&) { // empty default implementation: keywords found by DoIdentifier() are ignored
- }
- void TCppFullSax::DoName(const TText&) { // empty default implementation: tokens DoIdentifier() could not classify otherwise are ignored
- }
- void TCppFullSax::DoOctNumber(const TText&) { // empty default implementation: octal numbers are ignored
- }
- void TCppFullSax::DoHexNumber(const TText&) { // empty default implementation: hexadecimal numbers are ignored
- }
- void TCppFullSax::DoDecNumber(const TText&) { // empty default implementation: decimal numbers are ignored
- }
- void TCppFullSax::DoFloatNumber(const TText&) { // empty default implementation: floating point numbers are ignored
- }
- void TCppFullSax::DoSyntax(const TText&) { // empty default implementation: syntax (punctuation) runs are ignored
- }
- void TCppFullSax::DoOneLineComment(const TText&) { // empty default implementation: one-line comments are ignored
- }
- void TCppFullSax::DoMultiLineComment(const TText&) { // empty default implementation: multi-line comments are ignored
- }
- void TCppFullSax::DoPreprocessor(const TText&) { // empty default implementation: preprocessor directives are ignored
- }
|