parse_enum.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. #include "parse_enum.h"
  2. #include <library/cpp/cppparser/parser.h>
  3. #include <util/stream/file.h>
  4. #include <util/stream/output.h>
  5. #include <util/stream/input.h>
  6. #include <util/stream/mem.h>
  7. #include <util/charset/wide.h>
  8. #include <util/string/strip.h>
  9. #include <util/string/cast.h>
  10. #include <util/generic/map.h>
  11. #include <util/generic/string.h>
  12. #include <util/generic/vector.h>
  13. #include <util/generic/ptr.h>
  14. #include <util/generic/yexception.h>
  15. /**
  16. * Parse C-style strings inside multiline comments
  17. **/
  18. class TValuesContext: public TCppFullSax {
  19. public:
  20. void DoString(const TText& text) override {
  21. Values.push_back(text.Data);
  22. }
  23. ~TValuesContext() override {
  24. }
  25. TVector<TString> Values;
  26. };
  27. static TVector<TString> ParseEnumValues(const TString& strValues) {
  28. TVector<TString> result;
  29. TValuesContext ctx;
  30. TCppSaxParser parser(&ctx);
  31. TMemoryInput in(strValues.data(), strValues.size());
  32. TransferData(static_cast<IInputStream*>(&in), &parser);
  33. parser.Finish();
  34. for (const auto& value : ctx.Values) {
  35. Y_ENSURE(value.size() >= 2, "Invalid C-style string. ");
  36. TString dequoted = value.substr(1, value.size() - 2);
  37. // TODO: support C-unescaping
  38. result.push_back(dequoted);
  39. }
  40. return result;
  41. }
  42. /**
  43. * Parse C++ fragment with one enum
  44. **/
  45. class TEnumContext: public TCppFullSax {
  46. public:
  47. typedef TEnumParser::TItem TItem;
  48. typedef TEnumParser::TEnum TEnum;
  49. TEnumContext(TEnum& currentEnum)
  50. : CurrentEnum(currentEnum)
  51. {
  52. }
  53. ~TEnumContext() override {
  54. }
  55. void AddEnumItem() {
  56. if (!CurrentItem.CppName) {
  57. // uninitialized element should have no value too
  58. Y_ASSERT(!CurrentItem.Value.Defined());
  59. return;
  60. }
  61. // enum item C++ name should not be empty
  62. Y_ASSERT(CurrentItem.CppName);
  63. CurrentItem.NormalizeValue();
  64. CurrentEnum.Items.push_back(CurrentItem);
  65. CurrentItem.Clear();
  66. InEnumState = Begin;
  67. }
  68. template<class T>
  69. void AppendValue(const T& text) {
  70. // by pg@ advice, do not parse enum value
  71. // leave it to C++ compiler to parse/interpret
  72. if (!CurrentItem.Value)
  73. CurrentItem.Value = TString();
  74. *CurrentItem.Value += text;
  75. }
  76. void DoEnd() override {
  77. AddEnumItem();
  78. }
  79. void DoWhiteSpace(const TText& text) override {
  80. if (InValue == InEnumState || InValueCall == InEnumState) {
  81. AppendValue(text.Data);
  82. }
  83. }
  84. void DoSyntax(const TText& text) override {
  85. // For some reason, parser sometimes passes chunks like '{};' here,
  86. // so we handle each symbol separately.
  87. for (const char& sym : text.Data) {
  88. if ('{' == sym && InValue != InEnumState && InValueCall != InEnumState) {
  89. BodyDetected = true;
  90. continue;
  91. } else if ('=' == sym && InValueCall != InEnumState) {
  92. InEnumState = InValue;
  93. continue;
  94. } else if (('(' == sym || '{' == sym) && (InValue == InEnumState || InValueCall == InEnumState)) {
  95. // there may be constexpr function / constructor / macro call in value part,
  96. // handle them appropriately
  97. InEnumState = InValueCall;
  98. ++BracesBalance;
  99. AppendValue(sym);
  100. continue;
  101. } else if ((')' == sym || '}' == sym) && InValueCall == InEnumState) {
  102. if (!--BracesBalance) {
  103. InEnumState = InValue;
  104. }
  105. AppendValue(sym);
  106. continue;
  107. } else if ((',' == sym || '}' == sym) && InValueCall != InEnumState) {
  108. AddEnumItem();
  109. continue;
  110. } else if (InValue == InEnumState || InValueCall == InEnumState) {
  111. AppendValue(sym);
  112. }
  113. }
  114. }
  115. void DoName(const TText& text) override {
  116. if (!BodyDetected) {
  117. return;
  118. }
  119. if (InValue == InEnumState || InValueCall == InEnumState) {
  120. AppendValue(text.Data);
  121. return;
  122. }
  123. CurrentItem.CppName = text.Data;
  124. InEnumState = AfterCppName;
  125. }
  126. void DoKeyword(const TText& text) override {
  127. if (InValue == InEnumState || InValueCall == InEnumState) {
  128. AppendValue(text.Data);
  129. return;
  130. }
  131. }
  132. void DoCharacter(const TText& text) override {
  133. if (InValue == InEnumState || InValueCall == InEnumState) {
  134. AppendValue(text.Data);
  135. return;
  136. }
  137. }
  138. void DoMultiLineComment(const TText& text) override {
  139. Y_ENSURE(text.Data.size() >= 4, "Invalid multiline comment " << text.Data.Quote() << ". ");
  140. TString commentText = text.Data.substr(2, text.Data.size() - 4);
  141. commentText = StripString(commentText);
  142. CurrentItem.CommentText = commentText;
  143. CurrentItem.Aliases = ParseEnumValues(commentText);
  144. if (CurrentItem.Aliases && !CurrentItem.CppName) {
  145. // this means we process multiline comment when item name was not set yet.
  146. ythrow yexception() << "Are you hit with https://clubs.at.yandex-team.ru/stackoverflow/2603 typo? ";
  147. }
  148. }
  149. bool BodyDetected = false;
  150. enum EInEnumState {
  151. Begin,
  152. AfterCppName,
  153. InValue,
  154. InValueCall,
  155. End,
  156. };
  157. EInEnumState InEnumState = Begin;
  158. TEnum& CurrentEnum;
  159. TItem CurrentItem;
  160. size_t BracesBalance = 0;
  161. };
  162. /**
  163. * Parse C++ file
  164. **/
  165. class TCppContext: public TCppFullSax {
  166. public:
  167. typedef TEnumParser::TScope TScope;
  168. typedef TEnumParser::TItem TItem;
  169. typedef TEnumParser::TEnum TEnum;
  170. typedef TEnumParser::TEnums TEnums;
  171. const TString NAMESPACE = "<namespace>";
  172. const TString CLASS = "<class>";
  173. const TString STRUCT = "<struct>";
  174. const TString ENUM = "<enum>";
  175. const TString BLOCK = "<block>";
  176. TCppContext(const char* data, const TString& sourceFileName = TString())
  177. : Data(data)
  178. , SourceFileName(sourceFileName)
  179. {
  180. }
  181. ~TCppContext() override {
  182. }
  183. void DoSyntax(const TText& text) override {
  184. // For some reason, parser sometimes passes chunks like '{};' here,
  185. // so we handle each symbol separately.
  186. const TString& syn = text.Data;
  187. if (syn == "::" && InCompositeNamespace) {
  188. LastScope += syn;
  189. InCompositeNamespace = false;
  190. ScopeDeclaration = true;
  191. return;
  192. }
  193. for (size_t i = 0; i < syn.size(); ++i) {
  194. if ('{' == syn[i]) {
  195. OnEnterScope(text.Offset + i);
  196. if (InEnum) {
  197. CurrentEnum.BodyDetected = true;
  198. }
  199. } else if ('}' == syn[i]) {
  200. OnLeaveScope(text.Offset + i);
  201. } else if (';' == syn[i]) {
  202. // Handle SEARCH-1392
  203. if (InEnum && !CurrentEnum.BodyDetected) {
  204. CurrentEnum.ForwardDeclaration = true;
  205. InEnum = false;
  206. }
  207. }
  208. }
  209. }
  210. void DoKeyword(const TText& text) override {
  211. if (text.Data == "enum") {
  212. Y_ENSURE(!InEnum, "Enums cannot be nested. ");
  213. InEnum = true;
  214. EnumPos = text.Offset;
  215. CurrentEnum.Clear();
  216. CurrentEnum.Scope = Scope;
  217. ScopeDeclaration = true;
  218. NextScopeName = ENUM;
  219. //PrintScope();
  220. } else if (text.Data == "class") {
  221. if (InEnum) {
  222. CurrentEnum.EnumClass = true;
  223. return;
  224. }
  225. NextScopeName = CLASS;
  226. ScopeDeclaration = true;
  227. //PrintScope();
  228. } else if (text.Data == "struct") {
  229. if (InEnum) {
  230. CurrentEnum.EnumClass = true;
  231. return;
  232. }
  233. NextScopeName = STRUCT;
  234. ScopeDeclaration = true;
  235. //PrintScope();
  236. } else if (text.Data == "namespace") {
  237. NextScopeName = NAMESPACE;
  238. LastScope.clear();
  239. ScopeDeclaration = true;
  240. //PrintScope();
  241. }
  242. }
  243. void DoName(const TText& text) override {
  244. if (!ScopeDeclaration) {
  245. return;
  246. }
  247. if (InEnum) {
  248. CurrentEnum.CppName = text.Data;
  249. } else {
  250. if (NextScopeName == NAMESPACE) {
  251. InCompositeNamespace = true;
  252. LastScope += text.Data;
  253. } else {
  254. LastScope = text.Data;
  255. }
  256. }
  257. ScopeDeclaration = false;
  258. }
  259. void OnEnterScope(size_t /* offset */) {
  260. if (ScopeDeclaration) {
  261. // unnamed declaration or typedef
  262. ScopeDeclaration = false;
  263. }
  264. InCompositeNamespace = false;
  265. Scope.push_back(LastScope);
  266. LastScope.clear();
  267. //PrintScope();
  268. }
  269. /// @param offset: terminating curly brace position
  270. void OnLeaveScope(size_t offset) {
  271. if (!Scope) {
  272. size_t contextOffsetBegin = (offset >= 256) ? offset - 256 : 0;
  273. TString codeContext = TString(Data + contextOffsetBegin, offset - contextOffsetBegin + 1);
  274. ythrow yexception() << "C++ source parse failed: unbalanced scope. Did you miss a closing '}' bracket? "
  275. "Context: enum " << CurrentEnum.CppName.Quote() <<
  276. " in scope " << TEnumParser::ScopeStr(CurrentEnum.Scope).Quote() << ". Code context:\n... " <<
  277. codeContext << " ...";
  278. }
  279. Scope.pop_back();
  280. if (InEnum) {
  281. Y_ASSERT(offset > EnumPos);
  282. InEnum = false;
  283. try {
  284. ParseEnum(Data + EnumPos, offset - EnumPos + 1);
  285. } catch (...) {
  286. TString ofFile;
  287. if (SourceFileName) {
  288. ofFile += " of file ";
  289. ofFile += SourceFileName.Quote();
  290. }
  291. ythrow yexception() << "Failed to parse enum " << CurrentEnum.CppName <<
  292. " in scope " << TEnumParser::ScopeStr(CurrentEnum.Scope) << ofFile <<
  293. "\n<C++ parser error message>: " << CurrentExceptionMessage();
  294. }
  295. }
  296. //PrintScope();
  297. }
  298. void ParseEnum(const char* data, size_t length) {
  299. TEnumContext enumContext(CurrentEnum);
  300. TMemoryInput in(data, length);
  301. TCppSaxParser parser(&enumContext);
  302. TransferData(&in, &parser);
  303. parser.Finish();
  304. //PrintEnum(CurrentEnum);
  305. Enums.push_back(CurrentEnum);
  306. }
  307. // Some debug stuff goes here
  308. static void PrintScope(const TScope& scope) {
  309. Cerr << "Current scope: " << TEnumParser::ScopeStr(scope) << Endl;
  310. }
  311. void PrintScope() {
  312. PrintScope(Scope);
  313. }
  314. void PrintEnum(const TEnum& en) {
  315. Cerr << "Enum within scope " << TEnumParser::ScopeStr(en.Scope).Quote() << Endl;
  316. for (const auto& item : en.Items) {
  317. Cerr << " " << item.CppName;
  318. if (item.Value)
  319. Cerr << " = " << *item.Value;
  320. Cerr << Endl;
  321. for (const auto& value : item.Aliases) {
  322. Cerr << " " << value << Endl;
  323. }
  324. }
  325. }
  326. void PrintEnums() {
  327. for (const auto& en : Enums)
  328. PrintEnum(en);
  329. }
  330. public:
  331. TScope Scope;
  332. TEnums Enums;
  333. private:
  334. const char* const Data;
  335. TString SourceFileName;
  336. bool InEnum = false;
  337. bool ScopeDeclaration = false;
  338. bool InCompositeNamespace = false;
  339. TString NextScopeName = BLOCK;
  340. TString LastScope;
  341. size_t EnumPos = 0;
  342. TEnum CurrentEnum;
  343. };
  344. TEnumParser::TEnumParser(const TString& fileName) {
  345. THolder<IInputStream> hIn;
  346. IInputStream* in = nullptr;
  347. if (fileName != "-") {
  348. SourceFileName = fileName;
  349. hIn.Reset(new TFileInput(fileName));
  350. in = hIn.Get();
  351. } else {
  352. in = &Cin;
  353. }
  354. TString contents = in->ReadAll();
  355. Parse(contents.data(), contents.size());
  356. }
  357. TEnumParser::TEnumParser(const char* data, size_t length) {
  358. Parse(data, length);
  359. }
  360. TEnumParser::TEnumParser(IInputStream& in) {
  361. TString contents = in.ReadAll();
  362. Parse(contents.data(), contents.size());
  363. }
  364. void TEnumParser::Parse(const char* dataIn, size_t lengthIn) {
  365. TMemoryInput mi(dataIn, lengthIn);
  366. TString line;
  367. TString result;
  368. while (mi.ReadLine(line)) {
  369. if (line.find("if (GetOwningArena() == other->GetOwningArena()) {") == TString::npos) {
  370. result += line;
  371. result += "\n";
  372. }
  373. }
  374. const char* data = result.c_str();
  375. size_t length = result.length();
  376. const TStringBuf span(data, length);
  377. const bool hasPragmaOnce = span.Contains("#pragma once");
  378. const bool isProtobufHeader = span.Contains("// Generated by the protocol buffer compiler");
  379. const bool isFlatbuffersHeader = span.Contains("// automatically generated by the FlatBuffers compiler");
  380. Y_ENSURE(
  381. hasPragmaOnce || isProtobufHeader || isFlatbuffersHeader,
  382. "Serialization functions can be generated only for enums in header files. "
  383. "A valid header should either contain `#pragma once` or be an protobuf/flatbuf autogenerated header file. "
  384. "See SEARCH-975 for more information. "
  385. );
  386. TCppContext cppContext(data, SourceFileName);
  387. TMemoryInput in(data, length);
  388. TCppSaxParser parser(&cppContext);
  389. TransferData(&in, &parser);
  390. parser.Finish();
  391. // obtain result
  392. Enums = cppContext.Enums;
  393. if (cppContext.Scope) {
  394. cppContext.PrintEnums();
  395. cppContext.PrintScope();
  396. ythrow yexception() << "Unbalanced scope, something is wrong with enum parser. ";
  397. }
  398. }