123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739 |
- #include <util/generic/hash.h>
- #include <util/string/cast.h>
- #include <util/generic/hash_set.h>
- #include <util/generic/yexception.h>
- #include "parser.h"
- //#define DEBUG_ME 1
- TCppSaxParser::TText::TText()
- : Offset(0)
- {
- }
- TCppSaxParser::TText::TText(ui64 offset)
- : Offset(offset)
- {
- }
- TCppSaxParser::TText::TText(const TString& data, ui64 offset)
- : Data(data)
- , Offset(offset)
- {
- }
- TCppSaxParser::TText::~TText() = default;
- void TCppSaxParser::TText::Reset() noexcept {
- Offset += Data.length();
- Data.clear();
- }
- TCppSaxParser::TWorker::TWorker() noexcept = default;
- TCppSaxParser::TWorker::~TWorker() = default;
- class TCppSaxParser::TImpl {
- enum EState {
- Code,
- CommentBegin,
- String,
- Character,
- OneLineComment,
- MultiLineComment,
- MultiLineCommentEnd,
- Preprocessor
- };
- public:
- typedef TCppSaxParser::TText TText;
- typedef TCppSaxParser::TWorker TWorker;
- inline TImpl(TWorker* worker)
- : State_(Code)
- , Worker_(worker)
- , SkipNext_(false)
- , Line_(0)
- , Column_(0)
- {
- Worker_->DoStart();
- }
- inline ~TImpl() = default;
- inline void Write(const void* data, size_t len) {
- ProcessInput((const char*)data, len);
- }
- inline void Finish() {
- if (!Text_.Data.empty()) {
- switch (State_) {
- case Code:
- Worker_->DoCode(Text_);
- break;
- case Preprocessor:
- Worker_->DoPreprocessor(Text_);
- break;
- case OneLineComment:
- Worker_->DoOneLineComment(Text_);
- break;
- default:
- ThrowError();
- }
- }
- Worker_->DoEnd();
- }
- private:
- inline void ProcessInput(const char* data, size_t len) {
- EState savedState = Code;
- while (len) {
- const char ch = *data;
- if (ch == '\n') {
- ++Line_;
- Column_ = 0;
- } else {
- ++Column_;
- }
- #if DEBUG_ME
- Cerr << "char: " << ch << Endl;
- Cerr << "state before: " << (unsigned int)State_ << Endl;
- #endif
- retry:
- switch (State_) {
- case Code: {
- savedState = Code;
- switch (ch) {
- case '/':
- State_ = CommentBegin;
- break;
- case '"':
- Action(ch);
- State_ = String;
- break;
- case '\'':
- Action(ch);
- State_ = Character;
- break;
- case '#':
- Action(ch);
- State_ = Preprocessor;
- break;
- default:
- Text_.Data += ch;
- break;
- }
- break;
- }
- case CommentBegin: {
- switch (ch) {
- case '/':
- State_ = savedState;
- savedState = Code;
- Action("//");
- State_ = OneLineComment;
- break;
- case '*':
- State_ = savedState;
- Action("/*");
- State_ = MultiLineComment;
- break;
- default:
- Text_.Data += '/';
- State_ = savedState;
- goto retry;
- }
- break;
- }
- case OneLineComment: {
- switch (ch) {
- case '\n':
- Action(ch);
- State_ = Code;
- break;
- default:
- Text_.Data += ch;
- break;
- }
- break;
- }
- case MultiLineComment: {
- switch (ch) {
- case '*':
- Text_.Data += ch;
- State_ = MultiLineCommentEnd;
- break;
- case '\n':
- Text_.Data += ch;
- savedState = Code;
- break;
- default:
- Text_.Data += ch;
- break;
- }
- break;
- }
- case MultiLineCommentEnd: {
- switch (ch) {
- case '/':
- Text_.Data += ch;
- Action();
- State_ = savedState;
- break;
- default:
- State_ = MultiLineComment;
- goto retry;
- }
- break;
- }
- case String: {
- switch (ch) {
- case '"':
- Text_.Data += ch;
- if (SkipNext_) {
- SkipNext_ = false;
- } else {
- if (savedState == Code) {
- Action();
- }
- State_ = savedState;
- }
- break;
- case '\\':
- Text_.Data += ch;
- SkipNext_ = !SkipNext_;
- break;
- default:
- Text_.Data += ch;
- SkipNext_ = false;
- break;
- }
- break;
- }
- case Character: {
- switch (ch) {
- case '\'':
- Text_.Data += ch;
- if (SkipNext_) {
- SkipNext_ = false;
- } else {
- if (savedState == Code) {
- Action();
- }
- State_ = savedState;
- }
- break;
- case '\\':
- Text_.Data += ch;
- SkipNext_ = !SkipNext_;
- break;
- default:
- Text_.Data += ch;
- SkipNext_ = false;
- break;
- }
- break;
- }
- case Preprocessor: {
- savedState = Preprocessor;
- switch (ch) {
- case '/':
- State_ = CommentBegin;
- break;
- case '\'':
- Text_.Data += ch;
- State_ = Character;
- break;
- case '"':
- Text_.Data += ch;
- State_ = String;
- break;
- case '\n':
- Text_.Data += ch;
- if (SkipNext_) {
- SkipNext_ = false;
- } else {
- Action();
- savedState = Code;
- State_ = Code;
- }
- break;
- case '\\':
- Text_.Data += ch;
- SkipNext_ = true;
- break;
- default:
- Text_.Data += ch;
- SkipNext_ = false;
- break;
- }
- break;
- }
- default:
- ThrowError();
- }
- #if DEBUG_ME
- Cerr << "state after: " << (unsigned int)State_ << Endl;
- #endif
- ++data;
- --len;
- }
- }
- inline void Action(char ch) {
- Action();
- Text_.Data += ch;
- }
- inline void Action(const char* st) {
- Action();
- Text_.Data += st;
- }
- inline void Action() {
- switch (State_) {
- case Code:
- Worker_->DoCode(Text_);
- break;
- case OneLineComment:
- Worker_->DoOneLineComment(Text_);
- break;
- case MultiLineCommentEnd:
- Worker_->DoMultiLineComment(Text_);
- break;
- case Preprocessor:
- Worker_->DoPreprocessor(Text_);
- break;
- case String:
- Worker_->DoString(Text_);
- break;
- case Character:
- Worker_->DoCharacter(Text_);
- break;
- default:
- ThrowError();
- }
- Text_.Reset();
- }
- inline void ThrowError() const {
- ythrow yexception() << "can not parse source(line = " << (unsigned)Line_ + 1 << ", column = " << (unsigned)Column_ + 1 << ")";
- }
- private:
- EState State_;
- TWorker* Worker_;
- TText Text_;
- bool SkipNext_;
- ui64 Line_;
- ui64 Column_;
- };
- TCppSaxParser::TCppSaxParser(TWorker* worker)
- : Impl_(new TImpl(worker))
- {
- }
- TCppSaxParser::~TCppSaxParser() = default;
- void TCppSaxParser::DoWrite(const void* data, size_t len) {
- Impl_->Write(data, len);
- }
- void TCppSaxParser::DoFinish() {
- Impl_->Finish();
- }
- TCppSimpleSax::TCppSimpleSax() noexcept {
- }
- TCppSimpleSax::~TCppSimpleSax() = default;
- void TCppSimpleSax::DoCode(const TText& text) {
- static const char char_types[] = {
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
- 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1,
- 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
- static const char CWHITESPACE = 0;
- static const char CIDENTIFIER = 1;
- static const char CSYNTAX = 2;
- enum EState {
- WhiteSpace = CWHITESPACE,
- Identifier = CIDENTIFIER,
- Syntax = CSYNTAX
- };
- EState state = Identifier;
- TText cur(text.Offset);
- for (const auto& it : text.Data) {
- const unsigned char ch = *(const unsigned char*)(&it);
- const char type = char_types[ch];
- switch (state) {
- case Identifier: {
- switch (type) {
- case CIDENTIFIER:
- cur.Data += ch;
- break;
- default:
- if (!cur.Data.empty()) {
- DoIdentifier(cur);
- }
- cur.Reset();
- cur.Data += ch;
- state = (EState)type;
- break;
- }
- break;
- }
- case WhiteSpace: {
- switch (type) {
- case CWHITESPACE:
- cur.Data += ch;
- break;
- default:
- DoWhiteSpace(cur);
- cur.Reset();
- cur.Data += ch;
- state = (EState)type;
- break;
- }
- break;
- }
- case Syntax: {
- switch (type) {
- case CSYNTAX:
- cur.Data += ch;
- break;
- default:
- DoSyntax(cur);
- cur.Reset();
- cur.Data += ch;
- state = (EState)type;
- break;
- }
- break;
- }
- }
- }
- if (!cur.Data.empty()) {
- switch (state) {
- case Identifier:
- DoIdentifier(cur);
- break;
- case WhiteSpace:
- DoWhiteSpace(cur);
- break;
- case Syntax:
- DoSyntax(cur);
- break;
- }
- }
- }
- class TCppFullSax::TImpl {
- typedef THashSet<TString> TKeyWords;
- class TRegExp {
- public:
- inline TRegExp(const char*) {
- }
- inline bool Match(const TString& /*s*/) const noexcept {
- return false;
- }
- };
- public:
- inline TImpl()
- : OctNumber_("^[+-]?0[0-7]+$")
- , HexNumber_("^[+-]?0x[0-9A-Fa-f]+$")
- , DecNumber_("^[+-]?[0-9]+$")
- , FltNumber_("^[+-]?[0-9]*\\.[0-9]*$")
- {
- AddKeyword("extern");
- AddKeyword("static");
- AddKeyword("inline");
- AddKeyword("volatile");
- AddKeyword("asm");
- AddKeyword("const");
- AddKeyword("mutable");
- AddKeyword("char");
- AddKeyword("signed");
- AddKeyword("unsigned");
- AddKeyword("int");
- AddKeyword("short");
- AddKeyword("long");
- AddKeyword("double");
- AddKeyword("float");
- AddKeyword("bool");
- AddKeyword("class");
- AddKeyword("struct");
- AddKeyword("union");
- AddKeyword("void");
- AddKeyword("auto");
- AddKeyword("throw");
- AddKeyword("try");
- AddKeyword("catch");
- AddKeyword("for");
- AddKeyword("do");
- AddKeyword("if");
- AddKeyword("else");
- AddKeyword("while");
- AddKeyword("switch");
- AddKeyword("case");
- AddKeyword("default");
- AddKeyword("goto");
- AddKeyword("break");
- AddKeyword("continue");
- AddKeyword("virtual");
- AddKeyword("template");
- AddKeyword("typename");
- AddKeyword("enum");
- AddKeyword("public");
- AddKeyword("private");
- AddKeyword("protected");
- AddKeyword("using");
- AddKeyword("namespace");
- AddKeyword("typedef");
- AddKeyword("true");
- AddKeyword("false");
- AddKeyword("return");
- AddKeyword("new");
- AddKeyword("delete");
- AddKeyword("operator");
- AddKeyword("friend");
- AddKeyword("this");
- }
- inline ~TImpl() = default;
- inline void AddKeyword(const TString& keyword) {
- KeyWords_.insert(keyword);
- }
- inline bool IsKeyword(const TString& s) {
- return KeyWords_.find(s) != KeyWords_.end();
- }
- inline bool IsOctNumber(const TString& s) {
- return OctNumber_.Match(s);
- }
- inline bool IsHexNumber(const TString& s) {
- return HexNumber_.Match(s);
- }
- inline bool IsDecNumber(const TString& s) {
- return DecNumber_.Match(s);
- }
- inline bool IsFloatNumber(const TString& s) {
- return FltNumber_.Match(s);
- }
- private:
- const TRegExp OctNumber_;
- const TRegExp HexNumber_;
- const TRegExp DecNumber_;
- const TRegExp FltNumber_;
- TKeyWords KeyWords_;
- };
- TCppFullSax::TCppFullSax()
- : Impl_(new TImpl())
- {
- }
- TCppFullSax::~TCppFullSax() = default;
- void TCppFullSax::AddKeyword(const TString& keyword) {
- Impl_->AddKeyword(keyword);
- }
- void TCppFullSax::DoIdentifier(const TText& text) {
- if (Impl_->IsKeyword(text.Data)) {
- DoKeyword(text);
- } else if (Impl_->IsOctNumber(text.Data)) {
- DoOctNumber(text);
- } else if (Impl_->IsHexNumber(text.Data)) {
- DoHexNumber(text);
- } else if (Impl_->IsDecNumber(text.Data)) {
- DoDecNumber(text);
- } else if (Impl_->IsFloatNumber(text.Data)) {
- DoFloatNumber(text);
- } else {
- DoName(text);
- }
- }
- void TCppFullSax::DoEnd() {
- }
- void TCppFullSax::DoStart() {
- }
- void TCppFullSax::DoString(const TText&) {
- }
- void TCppFullSax::DoCharacter(const TText&) {
- }
- void TCppFullSax::DoWhiteSpace(const TText&) {
- }
- void TCppFullSax::DoKeyword(const TText&) {
- }
- void TCppFullSax::DoName(const TText&) {
- }
- void TCppFullSax::DoOctNumber(const TText&) {
- }
- void TCppFullSax::DoHexNumber(const TText&) {
- }
- void TCppFullSax::DoDecNumber(const TText&) {
- }
- void TCppFullSax::DoFloatNumber(const TText&) {
- }
- void TCppFullSax::DoSyntax(const TText&) {
- }
- void TCppFullSax::DoOneLineComment(const TText&) {
- }
- void TCppFullSax::DoMultiLineComment(const TText&) {
- }
- void TCppFullSax::DoPreprocessor(const TText&) {
- }
|