// parser.cpp
  1. #include <util/generic/hash.h>
  2. #include <util/string/ascii.h>
  3. #include <util/string/cast.h>
  4. #include <util/generic/hash_set.h>
  5. #include <util/generic/yexception.h>
  6. #include "parser.h"
  7. //#define DEBUG_ME 1
  8. TCppSaxParser::TText::TText()
  9. : Offset(0)
  10. {
  11. }
  12. TCppSaxParser::TText::TText(ui64 offset)
  13. : Offset(offset)
  14. {
  15. }
  16. TCppSaxParser::TText::TText(const TString& data, ui64 offset)
  17. : Data(data)
  18. , Offset(offset)
  19. {
  20. }
  21. TCppSaxParser::TText::~TText() = default;
  22. void TCppSaxParser::TText::Reset() noexcept {
  23. Offset += Data.length();
  24. Data.clear();
  25. }
  26. TCppSaxParser::TWorker::TWorker() noexcept = default;
  27. TCppSaxParser::TWorker::~TWorker() = default;
  28. class TCppSaxParser::TImpl {
  29. enum EState {
  30. Code,
  31. CommentBegin,
  32. String,
  33. Character,
  34. OneLineComment,
  35. MultiLineComment,
  36. MultiLineCommentEnd,
  37. Preprocessor
  38. };
  39. public:
  40. typedef TCppSaxParser::TText TText;
  41. typedef TCppSaxParser::TWorker TWorker;
  42. inline TImpl(TWorker* worker)
  43. : State_(Code)
  44. , Worker_(worker)
  45. , SkipNext_(false)
  46. , Line_(0)
  47. , Column_(0)
  48. {
  49. Worker_->DoStart();
  50. }
  51. inline ~TImpl() = default;
  52. inline void Write(const void* data, size_t len) {
  53. ProcessInput((const char*)data, len);
  54. }
  55. inline void Finish() {
  56. if (!Text_.Data.empty()) {
  57. switch (State_) {
  58. case Code:
  59. Worker_->DoCode(Text_);
  60. break;
  61. case Preprocessor:
  62. Worker_->DoPreprocessor(Text_);
  63. break;
  64. case OneLineComment:
  65. Worker_->DoOneLineComment(Text_);
  66. break;
  67. default:
  68. ThrowError();
  69. }
  70. }
  71. Worker_->DoEnd();
  72. }
  73. private:
  74. inline void ProcessInput(const char* data, size_t len) {
  75. EState savedState = Code;
  76. while (len) {
  77. const char ch = *data;
  78. if (ch == '\n') {
  79. ++Line_;
  80. Column_ = 0;
  81. } else {
  82. ++Column_;
  83. }
  84. #if DEBUG_ME
  85. Cerr << "char: " << ch << Endl;
  86. Cerr << "state before: " << (unsigned int)State_ << Endl;
  87. #endif
  88. retry:
  89. switch (State_) {
  90. case Code: {
  91. savedState = Code;
  92. switch (ch) {
  93. case '/':
  94. State_ = CommentBegin;
  95. break;
  96. case '"':
  97. Action(ch);
  98. State_ = String;
  99. break;
  100. case '\'':
  101. if (QuoteCharIsADigitSeparator()) {
  102. Text_.Data += ch;
  103. break;
  104. }
  105. Action(ch);
  106. State_ = Character;
  107. break;
  108. case '#':
  109. Action(ch);
  110. State_ = Preprocessor;
  111. break;
  112. default:
  113. Text_.Data += ch;
  114. break;
  115. }
  116. break;
  117. }
  118. case CommentBegin: {
  119. switch (ch) {
  120. case '/':
  121. State_ = savedState;
  122. savedState = Code;
  123. Action("//");
  124. State_ = OneLineComment;
  125. break;
  126. case '*':
  127. State_ = savedState;
  128. Action("/*");
  129. State_ = MultiLineComment;
  130. break;
  131. default:
  132. Text_.Data += '/';
  133. State_ = savedState;
  134. goto retry;
  135. }
  136. break;
  137. }
  138. case OneLineComment: {
  139. switch (ch) {
  140. case '\n':
  141. Action(ch);
  142. State_ = Code;
  143. break;
  144. default:
  145. Text_.Data += ch;
  146. break;
  147. }
  148. break;
  149. }
  150. case MultiLineComment: {
  151. switch (ch) {
  152. case '*':
  153. Text_.Data += ch;
  154. State_ = MultiLineCommentEnd;
  155. break;
  156. case '\n':
  157. Text_.Data += ch;
  158. savedState = Code;
  159. break;
  160. default:
  161. Text_.Data += ch;
  162. break;
  163. }
  164. break;
  165. }
  166. case MultiLineCommentEnd: {
  167. switch (ch) {
  168. case '/':
  169. Text_.Data += ch;
  170. Action();
  171. State_ = savedState;
  172. break;
  173. default:
  174. State_ = MultiLineComment;
  175. goto retry;
  176. }
  177. break;
  178. }
  179. case String: {
  180. switch (ch) {
  181. case '"':
  182. Text_.Data += ch;
  183. if (SkipNext_) {
  184. SkipNext_ = false;
  185. } else {
  186. if (savedState == Code) {
  187. Action();
  188. }
  189. State_ = savedState;
  190. }
  191. break;
  192. case '\\':
  193. Text_.Data += ch;
  194. SkipNext_ = !SkipNext_;
  195. break;
  196. default:
  197. Text_.Data += ch;
  198. SkipNext_ = false;
  199. break;
  200. }
  201. break;
  202. }
  203. case Character: {
  204. switch (ch) {
  205. case '\'':
  206. Text_.Data += ch;
  207. if (SkipNext_) {
  208. SkipNext_ = false;
  209. } else {
  210. if (savedState == Code) {
  211. Action();
  212. }
  213. State_ = savedState;
  214. }
  215. break;
  216. case '\\':
  217. Text_.Data += ch;
  218. SkipNext_ = !SkipNext_;
  219. break;
  220. default:
  221. Text_.Data += ch;
  222. SkipNext_ = false;
  223. break;
  224. }
  225. break;
  226. }
  227. case Preprocessor: {
  228. savedState = Preprocessor;
  229. switch (ch) {
  230. case '/':
  231. State_ = CommentBegin;
  232. break;
  233. case '\'':
  234. Text_.Data += ch;
  235. State_ = Character;
  236. break;
  237. case '"':
  238. Text_.Data += ch;
  239. State_ = String;
  240. break;
  241. case '\n':
  242. Text_.Data += ch;
  243. if (SkipNext_) {
  244. SkipNext_ = false;
  245. } else {
  246. Action();
  247. savedState = Code;
  248. State_ = Code;
  249. }
  250. break;
  251. case '\\':
  252. Text_.Data += ch;
  253. SkipNext_ = true;
  254. break;
  255. default:
  256. Text_.Data += ch;
  257. SkipNext_ = false;
  258. break;
  259. }
  260. break;
  261. }
  262. default:
  263. ThrowError();
  264. }
  265. #if DEBUG_ME
  266. Cerr << "state after: " << (unsigned int)State_ << Endl;
  267. #endif
  268. ++data;
  269. --len;
  270. }
  271. }
  272. // digit separator in integral literal (ex. 73'709'550'592)
  273. bool QuoteCharIsADigitSeparator() const {
  274. const TStringBuf data = Text_.Data;
  275. if (data.empty()) {
  276. return false;
  277. }
  278. if (!IsAsciiHex(data.back())) {
  279. return false;
  280. }
  281. // check for char literal prefix (ex. `u8'$'`)
  282. static constexpr TStringBuf literalPrefixes[] {
  283. "u8",
  284. "u",
  285. "U",
  286. "L",
  287. };
  288. for (const TStringBuf& literalPrefix : literalPrefixes) {
  289. if (TStringBuf prev; data.BeforeSuffix(literalPrefix, prev)) {
  290. if (!prev.empty() && (IsAsciiAlnum(prev.back()) || prev.back() == '_' || prev.back() == '$')) {
  291. // some macro name ends with an `u8` sequence
  292. continue;
  293. }
  294. // it is a prefixed character literal
  295. return false;
  296. }
  297. }
  298. return true;
  299. }
  300. inline void Action(char ch) {
  301. Action();
  302. Text_.Data += ch;
  303. }
  304. inline void Action(const char* st) {
  305. Action();
  306. Text_.Data += st;
  307. }
  308. inline void Action() {
  309. switch (State_) {
  310. case Code:
  311. Worker_->DoCode(Text_);
  312. break;
  313. case OneLineComment:
  314. Worker_->DoOneLineComment(Text_);
  315. break;
  316. case MultiLineCommentEnd:
  317. Worker_->DoMultiLineComment(Text_);
  318. break;
  319. case Preprocessor:
  320. Worker_->DoPreprocessor(Text_);
  321. break;
  322. case String:
  323. Worker_->DoString(Text_);
  324. break;
  325. case Character:
  326. Worker_->DoCharacter(Text_);
  327. break;
  328. default:
  329. ThrowError();
  330. }
  331. Text_.Reset();
  332. }
  333. inline void ThrowError() const {
  334. ythrow yexception() << "can not parse source(line = " << (unsigned)Line_ + 1 << ", column = " << (unsigned)Column_ + 1 << ")";
  335. }
  336. private:
  337. EState State_;
  338. TWorker* Worker_;
  339. TText Text_;
  340. bool SkipNext_;
  341. ui64 Line_;
  342. ui64 Column_;
  343. };
  344. TCppSaxParser::TCppSaxParser(TWorker* worker)
  345. : Impl_(new TImpl(worker))
  346. {
  347. }
  348. TCppSaxParser::~TCppSaxParser() = default;
  349. void TCppSaxParser::DoWrite(const void* data, size_t len) {
  350. Impl_->Write(data, len);
  351. }
  352. void TCppSaxParser::DoFinish() {
  353. Impl_->Finish();
  354. }
  355. TCppSimpleSax::TCppSimpleSax() noexcept {
  356. }
  357. TCppSimpleSax::~TCppSimpleSax() = default;
  358. void TCppSimpleSax::DoCode(const TText& text) {
  359. static const char char_types[] = {
  360. 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2,
  361. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  362. 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  363. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
  364. 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  365. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1,
  366. 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  367. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
  368. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  369. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  370. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  371. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  372. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  373. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  374. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  375. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
  376. static const char CWHITESPACE = 0;
  377. static const char CIDENTIFIER = 1;
  378. static const char CSYNTAX = 2;
  379. enum EState {
  380. WhiteSpace = CWHITESPACE,
  381. Identifier = CIDENTIFIER,
  382. Syntax = CSYNTAX
  383. };
  384. EState state = Identifier;
  385. TText cur(text.Offset);
  386. for (const auto& it : text.Data) {
  387. const unsigned char ch = *(const unsigned char*)(&it);
  388. const char type = char_types[ch];
  389. switch (state) {
  390. case Identifier: {
  391. switch (type) {
  392. case CIDENTIFIER:
  393. cur.Data += ch;
  394. break;
  395. default:
  396. if (!cur.Data.empty()) {
  397. DoIdentifier(cur);
  398. }
  399. cur.Reset();
  400. cur.Data += ch;
  401. state = (EState)type;
  402. break;
  403. }
  404. break;
  405. }
  406. case WhiteSpace: {
  407. switch (type) {
  408. case CWHITESPACE:
  409. cur.Data += ch;
  410. break;
  411. default:
  412. DoWhiteSpace(cur);
  413. cur.Reset();
  414. cur.Data += ch;
  415. state = (EState)type;
  416. break;
  417. }
  418. break;
  419. }
  420. case Syntax: {
  421. switch (type) {
  422. case CSYNTAX:
  423. cur.Data += ch;
  424. break;
  425. default:
  426. DoSyntax(cur);
  427. cur.Reset();
  428. cur.Data += ch;
  429. state = (EState)type;
  430. break;
  431. }
  432. break;
  433. }
  434. }
  435. }
  436. if (!cur.Data.empty()) {
  437. switch (state) {
  438. case Identifier:
  439. DoIdentifier(cur);
  440. break;
  441. case WhiteSpace:
  442. DoWhiteSpace(cur);
  443. break;
  444. case Syntax:
  445. DoSyntax(cur);
  446. break;
  447. }
  448. }
  449. }
  450. class TCppFullSax::TImpl {
  451. typedef THashSet<TString> TKeyWords;
  452. class TRegExp {
  453. public:
  454. inline TRegExp(const char*) {
  455. }
  456. inline bool Match(const TString& /*s*/) const noexcept {
  457. return false;
  458. }
  459. };
  460. public:
  461. inline TImpl()
  462. : OctNumber_("^[+-]?0[0-7]+$")
  463. , HexNumber_("^[+-]?0x[0-9A-Fa-f]+$")
  464. , DecNumber_("^[+-]?[0-9]+$")
  465. , FltNumber_("^[+-]?[0-9]*\\.[0-9]*$")
  466. {
  467. AddKeyword("extern");
  468. AddKeyword("static");
  469. AddKeyword("inline");
  470. AddKeyword("volatile");
  471. AddKeyword("asm");
  472. AddKeyword("const");
  473. AddKeyword("mutable");
  474. AddKeyword("char");
  475. AddKeyword("signed");
  476. AddKeyword("unsigned");
  477. AddKeyword("int");
  478. AddKeyword("short");
  479. AddKeyword("long");
  480. AddKeyword("double");
  481. AddKeyword("float");
  482. AddKeyword("bool");
  483. AddKeyword("class");
  484. AddKeyword("struct");
  485. AddKeyword("union");
  486. AddKeyword("void");
  487. AddKeyword("auto");
  488. AddKeyword("throw");
  489. AddKeyword("try");
  490. AddKeyword("catch");
  491. AddKeyword("for");
  492. AddKeyword("do");
  493. AddKeyword("if");
  494. AddKeyword("else");
  495. AddKeyword("while");
  496. AddKeyword("switch");
  497. AddKeyword("case");
  498. AddKeyword("default");
  499. AddKeyword("goto");
  500. AddKeyword("break");
  501. AddKeyword("continue");
  502. AddKeyword("virtual");
  503. AddKeyword("template");
  504. AddKeyword("typename");
  505. AddKeyword("enum");
  506. AddKeyword("public");
  507. AddKeyword("private");
  508. AddKeyword("protected");
  509. AddKeyword("using");
  510. AddKeyword("namespace");
  511. AddKeyword("typedef");
  512. AddKeyword("true");
  513. AddKeyword("false");
  514. AddKeyword("return");
  515. AddKeyword("new");
  516. AddKeyword("delete");
  517. AddKeyword("operator");
  518. AddKeyword("friend");
  519. AddKeyword("this");
  520. }
  521. inline ~TImpl() = default;
  522. inline void AddKeyword(const TString& keyword) {
  523. KeyWords_.insert(keyword);
  524. }
  525. inline bool IsKeyword(const TString& s) {
  526. return KeyWords_.find(s) != KeyWords_.end();
  527. }
  528. inline bool IsOctNumber(const TString& s) {
  529. return OctNumber_.Match(s);
  530. }
  531. inline bool IsHexNumber(const TString& s) {
  532. return HexNumber_.Match(s);
  533. }
  534. inline bool IsDecNumber(const TString& s) {
  535. return DecNumber_.Match(s);
  536. }
  537. inline bool IsFloatNumber(const TString& s) {
  538. return FltNumber_.Match(s);
  539. }
  540. private:
  541. const TRegExp OctNumber_;
  542. const TRegExp HexNumber_;
  543. const TRegExp DecNumber_;
  544. const TRegExp FltNumber_;
  545. TKeyWords KeyWords_;
  546. };
  547. TCppFullSax::TCppFullSax()
  548. : Impl_(new TImpl())
  549. {
  550. }
  551. TCppFullSax::~TCppFullSax() = default;
  552. void TCppFullSax::AddKeyword(const TString& keyword) {
  553. Impl_->AddKeyword(keyword);
  554. }
  555. void TCppFullSax::DoIdentifier(const TText& text) {
  556. if (Impl_->IsKeyword(text.Data)) {
  557. DoKeyword(text);
  558. } else if (Impl_->IsOctNumber(text.Data)) {
  559. DoOctNumber(text);
  560. } else if (Impl_->IsHexNumber(text.Data)) {
  561. DoHexNumber(text);
  562. } else if (Impl_->IsDecNumber(text.Data)) {
  563. DoDecNumber(text);
  564. } else if (Impl_->IsFloatNumber(text.Data)) {
  565. DoFloatNumber(text);
  566. } else {
  567. DoName(text);
  568. }
  569. }
  570. void TCppFullSax::DoEnd() {
  571. }
  572. void TCppFullSax::DoStart() {
  573. }
  574. void TCppFullSax::DoString(const TText&) {
  575. }
  576. void TCppFullSax::DoCharacter(const TText&) {
  577. }
  578. void TCppFullSax::DoWhiteSpace(const TText&) {
  579. }
  580. void TCppFullSax::DoKeyword(const TText&) {
  581. }
  582. void TCppFullSax::DoName(const TText&) {
  583. }
  584. void TCppFullSax::DoOctNumber(const TText&) {
  585. }
  586. void TCppFullSax::DoHexNumber(const TText&) {
  587. }
  588. void TCppFullSax::DoDecNumber(const TText&) {
  589. }
  590. void TCppFullSax::DoFloatNumber(const TText&) {
  591. }
  592. void TCppFullSax::DoSyntax(const TText&) {
  593. }
  594. void TCppFullSax::DoOneLineComment(const TText&) {
  595. }
  596. void TCppFullSax::DoMultiLineComment(const TText&) {
  597. }
  598. void TCppFullSax::DoPreprocessor(const TText&) {
  599. }