parser.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739
  1. #include <util/generic/hash.h>
  2. #include <util/string/cast.h>
  3. #include <util/generic/hash_set.h>
  4. #include <util/generic/yexception.h>
  5. #include "parser.h"
  6. //#define DEBUG_ME 1
  7. TCppSaxParser::TText::TText()
  8. : Offset(0)
  9. {
  10. }
  11. TCppSaxParser::TText::TText(ui64 offset)
  12. : Offset(offset)
  13. {
  14. }
  15. TCppSaxParser::TText::TText(const TString& data, ui64 offset)
  16. : Data(data)
  17. , Offset(offset)
  18. {
  19. }
  20. TCppSaxParser::TText::~TText() = default;
  21. void TCppSaxParser::TText::Reset() noexcept {
  22. Offset += Data.length();
  23. Data.clear();
  24. }
  25. TCppSaxParser::TWorker::TWorker() noexcept = default;
  26. TCppSaxParser::TWorker::~TWorker() = default;
  27. class TCppSaxParser::TImpl {
  28. enum EState {
  29. Code,
  30. CommentBegin,
  31. String,
  32. Character,
  33. OneLineComment,
  34. MultiLineComment,
  35. MultiLineCommentEnd,
  36. Preprocessor
  37. };
  38. public:
  39. typedef TCppSaxParser::TText TText;
  40. typedef TCppSaxParser::TWorker TWorker;
  41. inline TImpl(TWorker* worker)
  42. : State_(Code)
  43. , Worker_(worker)
  44. , SkipNext_(false)
  45. , Line_(0)
  46. , Column_(0)
  47. {
  48. Worker_->DoStart();
  49. }
  50. inline ~TImpl() = default;
  51. inline void Write(const void* data, size_t len) {
  52. ProcessInput((const char*)data, len);
  53. }
  54. inline void Finish() {
  55. if (!Text_.Data.empty()) {
  56. switch (State_) {
  57. case Code:
  58. Worker_->DoCode(Text_);
  59. break;
  60. case Preprocessor:
  61. Worker_->DoPreprocessor(Text_);
  62. break;
  63. case OneLineComment:
  64. Worker_->DoOneLineComment(Text_);
  65. break;
  66. default:
  67. ThrowError();
  68. }
  69. }
  70. Worker_->DoEnd();
  71. }
  72. private:
  73. inline void ProcessInput(const char* data, size_t len) {
  74. EState savedState = Code;
  75. while (len) {
  76. const char ch = *data;
  77. if (ch == '\n') {
  78. ++Line_;
  79. Column_ = 0;
  80. } else {
  81. ++Column_;
  82. }
  83. #if DEBUG_ME
  84. Cerr << "char: " << ch << Endl;
  85. Cerr << "state before: " << (unsigned int)State_ << Endl;
  86. #endif
  87. retry:
  88. switch (State_) {
  89. case Code: {
  90. savedState = Code;
  91. switch (ch) {
  92. case '/':
  93. State_ = CommentBegin;
  94. break;
  95. case '"':
  96. Action(ch);
  97. State_ = String;
  98. break;
  99. case '\'':
  100. Action(ch);
  101. State_ = Character;
  102. break;
  103. case '#':
  104. Action(ch);
  105. State_ = Preprocessor;
  106. break;
  107. default:
  108. Text_.Data += ch;
  109. break;
  110. }
  111. break;
  112. }
  113. case CommentBegin: {
  114. switch (ch) {
  115. case '/':
  116. State_ = savedState;
  117. savedState = Code;
  118. Action("//");
  119. State_ = OneLineComment;
  120. break;
  121. case '*':
  122. State_ = savedState;
  123. Action("/*");
  124. State_ = MultiLineComment;
  125. break;
  126. default:
  127. Text_.Data += '/';
  128. State_ = savedState;
  129. goto retry;
  130. }
  131. break;
  132. }
  133. case OneLineComment: {
  134. switch (ch) {
  135. case '\n':
  136. Action(ch);
  137. State_ = Code;
  138. break;
  139. default:
  140. Text_.Data += ch;
  141. break;
  142. }
  143. break;
  144. }
  145. case MultiLineComment: {
  146. switch (ch) {
  147. case '*':
  148. Text_.Data += ch;
  149. State_ = MultiLineCommentEnd;
  150. break;
  151. case '\n':
  152. Text_.Data += ch;
  153. savedState = Code;
  154. break;
  155. default:
  156. Text_.Data += ch;
  157. break;
  158. }
  159. break;
  160. }
  161. case MultiLineCommentEnd: {
  162. switch (ch) {
  163. case '/':
  164. Text_.Data += ch;
  165. Action();
  166. State_ = savedState;
  167. break;
  168. default:
  169. State_ = MultiLineComment;
  170. goto retry;
  171. }
  172. break;
  173. }
  174. case String: {
  175. switch (ch) {
  176. case '"':
  177. Text_.Data += ch;
  178. if (SkipNext_) {
  179. SkipNext_ = false;
  180. } else {
  181. if (savedState == Code) {
  182. Action();
  183. }
  184. State_ = savedState;
  185. }
  186. break;
  187. case '\\':
  188. Text_.Data += ch;
  189. SkipNext_ = !SkipNext_;
  190. break;
  191. default:
  192. Text_.Data += ch;
  193. SkipNext_ = false;
  194. break;
  195. }
  196. break;
  197. }
  198. case Character: {
  199. switch (ch) {
  200. case '\'':
  201. Text_.Data += ch;
  202. if (SkipNext_) {
  203. SkipNext_ = false;
  204. } else {
  205. if (savedState == Code) {
  206. Action();
  207. }
  208. State_ = savedState;
  209. }
  210. break;
  211. case '\\':
  212. Text_.Data += ch;
  213. SkipNext_ = !SkipNext_;
  214. break;
  215. default:
  216. Text_.Data += ch;
  217. SkipNext_ = false;
  218. break;
  219. }
  220. break;
  221. }
  222. case Preprocessor: {
  223. savedState = Preprocessor;
  224. switch (ch) {
  225. case '/':
  226. State_ = CommentBegin;
  227. break;
  228. case '\'':
  229. Text_.Data += ch;
  230. State_ = Character;
  231. break;
  232. case '"':
  233. Text_.Data += ch;
  234. State_ = String;
  235. break;
  236. case '\n':
  237. Text_.Data += ch;
  238. if (SkipNext_) {
  239. SkipNext_ = false;
  240. } else {
  241. Action();
  242. savedState = Code;
  243. State_ = Code;
  244. }
  245. break;
  246. case '\\':
  247. Text_.Data += ch;
  248. SkipNext_ = true;
  249. break;
  250. default:
  251. Text_.Data += ch;
  252. SkipNext_ = false;
  253. break;
  254. }
  255. break;
  256. }
  257. default:
  258. ThrowError();
  259. }
  260. #if DEBUG_ME
  261. Cerr << "state after: " << (unsigned int)State_ << Endl;
  262. #endif
  263. ++data;
  264. --len;
  265. }
  266. }
  267. inline void Action(char ch) {
  268. Action();
  269. Text_.Data += ch;
  270. }
  271. inline void Action(const char* st) {
  272. Action();
  273. Text_.Data += st;
  274. }
  275. inline void Action() {
  276. switch (State_) {
  277. case Code:
  278. Worker_->DoCode(Text_);
  279. break;
  280. case OneLineComment:
  281. Worker_->DoOneLineComment(Text_);
  282. break;
  283. case MultiLineCommentEnd:
  284. Worker_->DoMultiLineComment(Text_);
  285. break;
  286. case Preprocessor:
  287. Worker_->DoPreprocessor(Text_);
  288. break;
  289. case String:
  290. Worker_->DoString(Text_);
  291. break;
  292. case Character:
  293. Worker_->DoCharacter(Text_);
  294. break;
  295. default:
  296. ThrowError();
  297. }
  298. Text_.Reset();
  299. }
  300. inline void ThrowError() const {
  301. ythrow yexception() << "can not parse source(line = " << (unsigned)Line_ + 1 << ", column = " << (unsigned)Column_ + 1 << ")";
  302. }
  303. private:
  304. EState State_;
  305. TWorker* Worker_;
  306. TText Text_;
  307. bool SkipNext_;
  308. ui64 Line_;
  309. ui64 Column_;
  310. };
  311. TCppSaxParser::TCppSaxParser(TWorker* worker)
  312. : Impl_(new TImpl(worker))
  313. {
  314. }
  315. TCppSaxParser::~TCppSaxParser() = default;
  316. void TCppSaxParser::DoWrite(const void* data, size_t len) {
  317. Impl_->Write(data, len);
  318. }
  319. void TCppSaxParser::DoFinish() {
  320. Impl_->Finish();
  321. }
  322. TCppSimpleSax::TCppSimpleSax() noexcept {
  323. }
  324. TCppSimpleSax::~TCppSimpleSax() = default;
  325. void TCppSimpleSax::DoCode(const TText& text) {
  326. static const char char_types[] = {
  327. 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2,
  328. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  329. 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  330. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
  331. 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  332. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1,
  333. 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  334. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
  335. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  336. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  337. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  338. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  339. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  340. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  341. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  342. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
  343. static const char CWHITESPACE = 0;
  344. static const char CIDENTIFIER = 1;
  345. static const char CSYNTAX = 2;
  346. enum EState {
  347. WhiteSpace = CWHITESPACE,
  348. Identifier = CIDENTIFIER,
  349. Syntax = CSYNTAX
  350. };
  351. EState state = Identifier;
  352. TText cur(text.Offset);
  353. for (const auto& it : text.Data) {
  354. const unsigned char ch = *(const unsigned char*)(&it);
  355. const char type = char_types[ch];
  356. switch (state) {
  357. case Identifier: {
  358. switch (type) {
  359. case CIDENTIFIER:
  360. cur.Data += ch;
  361. break;
  362. default:
  363. if (!cur.Data.empty()) {
  364. DoIdentifier(cur);
  365. }
  366. cur.Reset();
  367. cur.Data += ch;
  368. state = (EState)type;
  369. break;
  370. }
  371. break;
  372. }
  373. case WhiteSpace: {
  374. switch (type) {
  375. case CWHITESPACE:
  376. cur.Data += ch;
  377. break;
  378. default:
  379. DoWhiteSpace(cur);
  380. cur.Reset();
  381. cur.Data += ch;
  382. state = (EState)type;
  383. break;
  384. }
  385. break;
  386. }
  387. case Syntax: {
  388. switch (type) {
  389. case CSYNTAX:
  390. cur.Data += ch;
  391. break;
  392. default:
  393. DoSyntax(cur);
  394. cur.Reset();
  395. cur.Data += ch;
  396. state = (EState)type;
  397. break;
  398. }
  399. break;
  400. }
  401. }
  402. }
  403. if (!cur.Data.empty()) {
  404. switch (state) {
  405. case Identifier:
  406. DoIdentifier(cur);
  407. break;
  408. case WhiteSpace:
  409. DoWhiteSpace(cur);
  410. break;
  411. case Syntax:
  412. DoSyntax(cur);
  413. break;
  414. }
  415. }
  416. }
  417. class TCppFullSax::TImpl {
  418. typedef THashSet<TString> TKeyWords;
  419. class TRegExp {
  420. public:
  421. inline TRegExp(const char*) {
  422. }
  423. inline bool Match(const TString& /*s*/) const noexcept {
  424. return false;
  425. }
  426. };
  427. public:
  428. inline TImpl()
  429. : OctNumber_("^[+-]?0[0-7]+$")
  430. , HexNumber_("^[+-]?0x[0-9A-Fa-f]+$")
  431. , DecNumber_("^[+-]?[0-9]+$")
  432. , FltNumber_("^[+-]?[0-9]*\\.[0-9]*$")
  433. {
  434. AddKeyword("extern");
  435. AddKeyword("static");
  436. AddKeyword("inline");
  437. AddKeyword("volatile");
  438. AddKeyword("asm");
  439. AddKeyword("const");
  440. AddKeyword("mutable");
  441. AddKeyword("char");
  442. AddKeyword("signed");
  443. AddKeyword("unsigned");
  444. AddKeyword("int");
  445. AddKeyword("short");
  446. AddKeyword("long");
  447. AddKeyword("double");
  448. AddKeyword("float");
  449. AddKeyword("bool");
  450. AddKeyword("class");
  451. AddKeyword("struct");
  452. AddKeyword("union");
  453. AddKeyword("void");
  454. AddKeyword("auto");
  455. AddKeyword("throw");
  456. AddKeyword("try");
  457. AddKeyword("catch");
  458. AddKeyword("for");
  459. AddKeyword("do");
  460. AddKeyword("if");
  461. AddKeyword("else");
  462. AddKeyword("while");
  463. AddKeyword("switch");
  464. AddKeyword("case");
  465. AddKeyword("default");
  466. AddKeyword("goto");
  467. AddKeyword("break");
  468. AddKeyword("continue");
  469. AddKeyword("virtual");
  470. AddKeyword("template");
  471. AddKeyword("typename");
  472. AddKeyword("enum");
  473. AddKeyword("public");
  474. AddKeyword("private");
  475. AddKeyword("protected");
  476. AddKeyword("using");
  477. AddKeyword("namespace");
  478. AddKeyword("typedef");
  479. AddKeyword("true");
  480. AddKeyword("false");
  481. AddKeyword("return");
  482. AddKeyword("new");
  483. AddKeyword("delete");
  484. AddKeyword("operator");
  485. AddKeyword("friend");
  486. AddKeyword("this");
  487. }
  488. inline ~TImpl() = default;
  489. inline void AddKeyword(const TString& keyword) {
  490. KeyWords_.insert(keyword);
  491. }
  492. inline bool IsKeyword(const TString& s) {
  493. return KeyWords_.find(s) != KeyWords_.end();
  494. }
  495. inline bool IsOctNumber(const TString& s) {
  496. return OctNumber_.Match(s);
  497. }
  498. inline bool IsHexNumber(const TString& s) {
  499. return HexNumber_.Match(s);
  500. }
  501. inline bool IsDecNumber(const TString& s) {
  502. return DecNumber_.Match(s);
  503. }
  504. inline bool IsFloatNumber(const TString& s) {
  505. return FltNumber_.Match(s);
  506. }
  507. private:
  508. const TRegExp OctNumber_;
  509. const TRegExp HexNumber_;
  510. const TRegExp DecNumber_;
  511. const TRegExp FltNumber_;
  512. TKeyWords KeyWords_;
  513. };
  514. TCppFullSax::TCppFullSax()
  515. : Impl_(new TImpl())
  516. {
  517. }
  518. TCppFullSax::~TCppFullSax() = default;
  519. void TCppFullSax::AddKeyword(const TString& keyword) {
  520. Impl_->AddKeyword(keyword);
  521. }
  522. void TCppFullSax::DoIdentifier(const TText& text) {
  523. if (Impl_->IsKeyword(text.Data)) {
  524. DoKeyword(text);
  525. } else if (Impl_->IsOctNumber(text.Data)) {
  526. DoOctNumber(text);
  527. } else if (Impl_->IsHexNumber(text.Data)) {
  528. DoHexNumber(text);
  529. } else if (Impl_->IsDecNumber(text.Data)) {
  530. DoDecNumber(text);
  531. } else if (Impl_->IsFloatNumber(text.Data)) {
  532. DoFloatNumber(text);
  533. } else {
  534. DoName(text);
  535. }
  536. }
  537. void TCppFullSax::DoEnd() {
  538. }
  539. void TCppFullSax::DoStart() {
  540. }
  541. void TCppFullSax::DoString(const TText&) {
  542. }
  543. void TCppFullSax::DoCharacter(const TText&) {
  544. }
  545. void TCppFullSax::DoWhiteSpace(const TText&) {
  546. }
  547. void TCppFullSax::DoKeyword(const TText&) {
  548. }
  549. void TCppFullSax::DoName(const TText&) {
  550. }
  551. void TCppFullSax::DoOctNumber(const TText&) {
  552. }
  553. void TCppFullSax::DoHexNumber(const TText&) {
  554. }
  555. void TCppFullSax::DoDecNumber(const TText&) {
  556. }
  557. void TCppFullSax::DoFloatNumber(const TText&) {
  558. }
  559. void TCppFullSax::DoSyntax(const TText&) {
  560. }
  561. void TCppFullSax::DoOneLineComment(const TText&) {
  562. }
  563. void TCppFullSax::DoMultiLineComment(const TText&) {
  564. }
  565. void TCppFullSax::DoPreprocessor(const TText&) {
  566. }