parse_enum.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. #include "parse_enum.h"
  2. #include <library/cpp/cppparser/parser.h>
  3. #include <util/stream/file.h>
  4. #include <util/stream/output.h>
  5. #include <util/stream/input.h>
  6. #include <util/stream/mem.h>
  7. #include <util/charset/wide.h>
  8. #include <util/string/strip.h>
  9. #include <util/string/cast.h>
  10. #include <util/generic/map.h>
  11. #include <util/generic/string.h>
  12. #include <util/generic/vector.h>
  13. #include <util/generic/ptr.h>
  14. #include <util/generic/yexception.h>
  15. /**
  16. * Parse C-style strings inside multiline comments
  17. **/
  18. class TValuesContext: public TCppFullSax {
  19. public:
  20. void DoString(const TText& text) override {
  21. Values.push_back(text.Data);
  22. }
  23. ~TValuesContext() override {
  24. }
  25. TVector<TString> Values;
  26. };
  27. static TVector<TString> ParseEnumValues(const TString& strValues) {
  28. TVector<TString> result;
  29. TValuesContext ctx;
  30. TCppSaxParser parser(&ctx);
  31. TMemoryInput in(strValues.data(), strValues.size());
  32. TransferData(static_cast<IInputStream*>(&in), &parser);
  33. parser.Finish();
  34. for (const auto& value : ctx.Values) {
  35. Y_ENSURE(value.size() >= 2, "Invalid C-style string. ");
  36. TString dequoted = value.substr(1, value.size() - 2);
  37. // TODO: support C-unescaping
  38. result.push_back(dequoted);
  39. }
  40. return result;
  41. }
  42. /**
  43. * Parse C++ fragment with one enum
  44. **/
  45. class TEnumContext: public TCppFullSax {
  46. public:
  47. typedef TEnumParser::TItem TItem;
  48. typedef TEnumParser::TEnum TEnum;
  49. TEnumContext(TEnum& currentEnum)
  50. : CurrentEnum(currentEnum)
  51. {
  52. }
  53. ~TEnumContext() override {
  54. }
  55. void AddEnumItem() {
  56. if (!CurrentItem.CppName) {
  57. // uninitialized element should have no value too
  58. Y_ASSERT(!CurrentItem.Value.Defined());
  59. return;
  60. }
  61. // enum item C++ name should not be empty
  62. Y_ASSERT(CurrentItem.CppName);
  63. CurrentItem.NormalizeValue();
  64. CurrentEnum.Items.push_back(CurrentItem);
  65. CurrentItem.Clear();
  66. InEnumState = Begin;
  67. }
  68. template<class T>
  69. void AppendValue(const T& text) {
  70. // by pg@ advice, do not parse enum value
  71. // leave it to C++ compiler to parse/interpret
  72. if (!CurrentItem.Value)
  73. CurrentItem.Value = TString();
  74. *CurrentItem.Value += text;
  75. }
  76. void DoEnd() override {
  77. AddEnumItem();
  78. }
  79. void DoWhiteSpace(const TText& text) override {
  80. if (InValue == InEnumState || InValueCall == InEnumState) {
  81. AppendValue(text.Data);
  82. }
  83. }
  84. void DoSyntax(const TText& text) override {
  85. // For some reason, parser sometimes passes chunks like '{};' here,
  86. // so we handle each symbol separately.
  87. for (const char& sym : text.Data) {
  88. if ('{' == sym && InValue != InEnumState && InValueCall != InEnumState) {
  89. BodyDetected = true;
  90. continue;
  91. } else if ('=' == sym && InValueCall != InEnumState) {
  92. InEnumState = InValue;
  93. continue;
  94. } else if (('(' == sym || '{' == sym) && (InValue == InEnumState || InValueCall == InEnumState)) {
  95. // there may be constexpr function / constructor / macro call in value part,
  96. // handle them appropriately
  97. InEnumState = InValueCall;
  98. ++BracesBalance;
  99. AppendValue(sym);
  100. continue;
  101. } else if ((')' == sym || '}' == sym) && InValueCall == InEnumState) {
  102. if (!--BracesBalance) {
  103. InEnumState = InValue;
  104. }
  105. AppendValue(sym);
  106. continue;
  107. } else if ((',' == sym || '}' == sym) && InValueCall != InEnumState) {
  108. AddEnumItem();
  109. continue;
  110. } else if (InValue == InEnumState || InValueCall == InEnumState) {
  111. AppendValue(sym);
  112. }
  113. }
  114. }
  115. void DoName(const TText& text) override {
  116. if (!BodyDetected) {
  117. return;
  118. }
  119. if (InValue == InEnumState || InValueCall == InEnumState) {
  120. AppendValue(text.Data);
  121. return;
  122. }
  123. CurrentItem.CppName = text.Data;
  124. InEnumState = AfterCppName;
  125. }
  126. void DoMultiLineComment(const TText& text) override {
  127. Y_ENSURE(text.Data.size() >= 4, "Invalid multiline comment " << text.Data.Quote() << ". ");
  128. TString commentText = text.Data.substr(2, text.Data.size() - 4);
  129. commentText = StripString(commentText);
  130. CurrentItem.CommentText = commentText;
  131. CurrentItem.Aliases = ParseEnumValues(commentText);
  132. if (CurrentItem.Aliases && !CurrentItem.CppName) {
  133. // this means we process multiline comment when item name was not set yet.
  134. ythrow yexception() << "Are you hit with https://clubs.at.yandex-team.ru/stackoverflow/2603 typo? ";
  135. }
  136. }
  137. bool BodyDetected = false;
  138. enum EInEnumState {
  139. Begin,
  140. AfterCppName,
  141. InValue,
  142. InValueCall,
  143. End,
  144. };
  145. EInEnumState InEnumState = Begin;
  146. TEnum& CurrentEnum;
  147. TItem CurrentItem;
  148. size_t BracesBalance = 0;
  149. };
  150. /**
  151. * Parse C++ file
  152. **/
  153. class TCppContext: public TCppFullSax {
  154. public:
  155. typedef TEnumParser::TScope TScope;
  156. typedef TEnumParser::TItem TItem;
  157. typedef TEnumParser::TEnum TEnum;
  158. typedef TEnumParser::TEnums TEnums;
  159. const TString NAMESPACE = "<namespace>";
  160. const TString CLASS = "<class>";
  161. const TString STRUCT = "<struct>";
  162. const TString ENUM = "<enum>";
  163. const TString BLOCK = "<block>";
  164. TCppContext(const char* data, const TString& sourceFileName = TString())
  165. : Data(data)
  166. , SourceFileName(sourceFileName)
  167. {
  168. }
  169. ~TCppContext() override {
  170. }
  171. void DoSyntax(const TText& text) override {
  172. // For some reason, parser sometimes passes chunks like '{};' here,
  173. // so we handle each symbol separately.
  174. const TString& syn = text.Data;
  175. if (syn == "::" && InCompositeNamespace) {
  176. LastScope += syn;
  177. InCompositeNamespace = false;
  178. ScopeDeclaration = true;
  179. return;
  180. }
  181. for (size_t i = 0; i < syn.size(); ++i) {
  182. if ('{' == syn[i]) {
  183. OnEnterScope(text.Offset + i);
  184. if (InEnum) {
  185. CurrentEnum.BodyDetected = true;
  186. }
  187. } else if ('}' == syn[i]) {
  188. OnLeaveScope(text.Offset + i);
  189. } else if (';' == syn[i]) {
  190. // Handle SEARCH-1392
  191. if (InEnum && !CurrentEnum.BodyDetected) {
  192. CurrentEnum.ForwardDeclaration = true;
  193. InEnum = false;
  194. }
  195. }
  196. }
  197. }
  198. void DoKeyword(const TText& text) override {
  199. if (text.Data == "enum") {
  200. Y_ENSURE(!InEnum, "Enums cannot be nested. ");
  201. InEnum = true;
  202. EnumPos = text.Offset;
  203. CurrentEnum.Clear();
  204. CurrentEnum.Scope = Scope;
  205. ScopeDeclaration = true;
  206. NextScopeName = ENUM;
  207. //PrintScope();
  208. } else if (text.Data == "class") {
  209. if (InEnum) {
  210. CurrentEnum.EnumClass = true;
  211. return;
  212. }
  213. NextScopeName = CLASS;
  214. ScopeDeclaration = true;
  215. //PrintScope();
  216. } else if (text.Data == "struct") {
  217. if (InEnum) {
  218. CurrentEnum.EnumClass = true;
  219. return;
  220. }
  221. NextScopeName = STRUCT;
  222. ScopeDeclaration = true;
  223. //PrintScope();
  224. } else if (text.Data == "namespace") {
  225. NextScopeName = NAMESPACE;
  226. LastScope.clear();
  227. ScopeDeclaration = true;
  228. //PrintScope();
  229. }
  230. }
  231. void DoName(const TText& text) override {
  232. if (!ScopeDeclaration) {
  233. return;
  234. }
  235. if (InEnum) {
  236. CurrentEnum.CppName = text.Data;
  237. } else {
  238. if (NextScopeName == NAMESPACE) {
  239. InCompositeNamespace = true;
  240. LastScope += text.Data;
  241. } else {
  242. LastScope = text.Data;
  243. }
  244. }
  245. ScopeDeclaration = false;
  246. }
  247. void OnEnterScope(size_t /* offset */) {
  248. if (ScopeDeclaration) {
  249. // unnamed declaration or typedef
  250. ScopeDeclaration = false;
  251. }
  252. InCompositeNamespace = false;
  253. Scope.push_back(LastScope);
  254. LastScope.clear();
  255. //PrintScope();
  256. }
  257. /// @param offset: terminating curly brace position
  258. void OnLeaveScope(size_t offset) {
  259. if (!Scope) {
  260. size_t contextOffsetBegin = (offset >= 256) ? offset - 256 : 0;
  261. TString codeContext = TString(Data + contextOffsetBegin, offset - contextOffsetBegin + 1);
  262. ythrow yexception() << "C++ source parse failed: unbalanced scope. Did you miss a closing '}' bracket? "
  263. "Context: enum " << CurrentEnum.CppName.Quote() <<
  264. " in scope " << TEnumParser::ScopeStr(CurrentEnum.Scope).Quote() << ". Code context:\n... " <<
  265. codeContext << " ...";
  266. }
  267. Scope.pop_back();
  268. if (InEnum) {
  269. Y_ASSERT(offset > EnumPos);
  270. InEnum = false;
  271. try {
  272. ParseEnum(Data + EnumPos, offset - EnumPos + 1);
  273. } catch (...) {
  274. TString ofFile;
  275. if (SourceFileName) {
  276. ofFile += " of file ";
  277. ofFile += SourceFileName.Quote();
  278. }
  279. ythrow yexception() << "Failed to parse enum " << CurrentEnum.CppName <<
  280. " in scope " << TEnumParser::ScopeStr(CurrentEnum.Scope) << ofFile <<
  281. "\n<C++ parser error message>: " << CurrentExceptionMessage();
  282. }
  283. }
  284. //PrintScope();
  285. }
  286. void ParseEnum(const char* data, size_t length) {
  287. TEnumContext enumContext(CurrentEnum);
  288. TMemoryInput in(data, length);
  289. TCppSaxParser parser(&enumContext);
  290. TransferData(&in, &parser);
  291. parser.Finish();
  292. //PrintEnum(CurrentEnum);
  293. Enums.push_back(CurrentEnum);
  294. }
  295. // Some debug stuff goes here
  296. static void PrintScope(const TScope& scope) {
  297. Cerr << "Current scope: " << TEnumParser::ScopeStr(scope) << Endl;
  298. }
  299. void PrintScope() {
  300. PrintScope(Scope);
  301. }
  302. void PrintEnum(const TEnum& en) {
  303. Cerr << "Enum within scope " << TEnumParser::ScopeStr(en.Scope).Quote() << Endl;
  304. for (const auto& item : en.Items) {
  305. Cerr << " " << item.CppName;
  306. if (item.Value)
  307. Cerr << " = " << *item.Value;
  308. Cerr << Endl;
  309. for (const auto& value : item.Aliases) {
  310. Cerr << " " << value << Endl;
  311. }
  312. }
  313. }
  314. void PrintEnums() {
  315. for (const auto& en : Enums)
  316. PrintEnum(en);
  317. }
  318. public:
  319. TScope Scope;
  320. TEnums Enums;
  321. private:
  322. const char* const Data;
  323. TString SourceFileName;
  324. bool InEnum = false;
  325. bool ScopeDeclaration = false;
  326. bool InCompositeNamespace = false;
  327. TString NextScopeName = BLOCK;
  328. TString LastScope;
  329. size_t EnumPos = 0;
  330. TEnum CurrentEnum;
  331. };
  332. TEnumParser::TEnumParser(const TString& fileName) {
  333. THolder<IInputStream> hIn;
  334. IInputStream* in = nullptr;
  335. if (fileName != "-") {
  336. SourceFileName = fileName;
  337. hIn.Reset(new TFileInput(fileName));
  338. in = hIn.Get();
  339. } else {
  340. in = &Cin;
  341. }
  342. TString contents = in->ReadAll();
  343. Parse(contents.data(), contents.size());
  344. }
  345. TEnumParser::TEnumParser(const char* data, size_t length) {
  346. Parse(data, length);
  347. }
  348. TEnumParser::TEnumParser(IInputStream& in) {
  349. TString contents = in.ReadAll();
  350. Parse(contents.data(), contents.size());
  351. }
  352. void TEnumParser::Parse(const char* dataIn, size_t lengthIn) {
  353. TMemoryInput mi(dataIn, lengthIn);
  354. TString line;
  355. TString result;
  356. while (mi.ReadLine(line)) {
  357. if (line.find("if (GetOwningArena() == other->GetOwningArena()) {") == TString::npos) {
  358. result += line;
  359. result += "\n";
  360. }
  361. }
  362. const char* data = result.c_str();
  363. size_t length = result.length();
  364. const TStringBuf span(data, length);
  365. const bool hasPragmaOnce = span.Contains("#pragma once");
  366. const bool isProtobufHeader = span.Contains("// Generated by the protocol buffer compiler");
  367. const bool isFlatbuffersHeader = span.Contains("// automatically generated by the FlatBuffers compiler");
  368. Y_ENSURE(
  369. hasPragmaOnce || isProtobufHeader || isFlatbuffersHeader,
  370. "Serialization functions can be generated only for enums in header files. "
  371. "A valid header should either contain `#pragma once` or be an protobuf/flatbuf autogenerated header file. "
  372. "See SEARCH-975 for more information. "
  373. );
  374. TCppContext cppContext(data, SourceFileName);
  375. TMemoryInput in(data, length);
  376. TCppSaxParser parser(&cppContext);
  377. TransferData(&in, &parser);
  378. parser.Finish();
  379. // obtain result
  380. Enums = cppContext.Enums;
  381. if (cppContext.Scope) {
  382. cppContext.PrintEnums();
  383. cppContext.PrintScope();
  384. ythrow yexception() << "Unbalanced scope, something is wrong with enum parser. ";
  385. }
  386. }