protobuf_format.cpp 51 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500
  1. #include "protobuf_format.h"
  2. #include "errors.h"
  3. #include <yt/yt/core/misc/protobuf_helpers.h>
  4. #include <yt/yt_proto/yt/formats/extension.pb.h>
  5. #include <google/protobuf/text_format.h>
  6. #include <library/cpp/yson/node/node_io.h>
  7. #include <util/generic/hash_set.h>
  8. #include <util/generic/stack.h>
  9. #include <util/generic/overloaded.h>
  10. #include <util/stream/output.h>
  11. #include <util/stream/file.h>
  12. namespace NYT::NDetail {
  13. using ::google::protobuf::Descriptor;
  14. using ::google::protobuf::DescriptorProto;
  15. using ::google::protobuf::EnumDescriptor;
  16. using ::google::protobuf::EnumDescriptorProto;
  17. using ::google::protobuf::FieldDescriptor;
  18. using ::google::protobuf::FieldDescriptorProto;
  19. using ::google::protobuf::OneofDescriptor;
  20. using ::google::protobuf::Message;
  21. using ::google::protobuf::FileDescriptor;
  22. using ::google::protobuf::FileDescriptorProto;
  23. using ::google::protobuf::FileDescriptorSet;
  24. using ::google::protobuf::FieldOptions;
  25. using ::google::protobuf::FileOptions;
  26. using ::google::protobuf::OneofOptions;
  27. using ::google::protobuf::MessageOptions;
  28. using ::ToString;
  29. namespace {
  30. ////////////////////////////////////////////////////////////////////////////////
  31. using TOneofOption = std::variant<
  32. EProtobufOneofMode>;
  33. using TFieldOption = std::variant<
  34. EProtobufType,
  35. EProtobufSerializationMode,
  36. EProtobufListMode,
  37. EProtobufMapMode,
  38. EProtobufEnumWritingMode>;
  39. using TMessageOption = std::variant<
  40. EProtobufFieldSortOrder>;
  41. struct TOtherColumns
  42. { };
  43. using TValueTypeOrOtherColumns = std::variant<EValueType, TOtherColumns>;
  44. ////////////////////////////////////////////////////////////////////////////////
  45. TFieldOption FieldFlagToOption(EWrapperFieldFlag::Enum flag)
  46. {
  47. using EFlag = EWrapperFieldFlag;
  48. switch (flag) {
  49. case EFlag::SERIALIZATION_PROTOBUF:
  50. return EProtobufSerializationMode::Protobuf;
  51. case EFlag::SERIALIZATION_YT:
  52. return EProtobufSerializationMode::Yt;
  53. case EFlag::ANY:
  54. return EProtobufType::Any;
  55. case EFlag::OTHER_COLUMNS:
  56. return EProtobufType::OtherColumns;
  57. case EFlag::ENUM_INT:
  58. return EProtobufType::EnumInt;
  59. case EFlag::ENUM_STRING:
  60. return EProtobufType::EnumString;
  61. case EFlag::OPTIONAL_LIST:
  62. return EProtobufListMode::Optional;
  63. case EFlag::REQUIRED_LIST:
  64. return EProtobufListMode::Required;
  65. case EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY:
  66. return EProtobufMapMode::ListOfStructsLegacy;
  67. case EFlag::MAP_AS_LIST_OF_STRUCTS:
  68. return EProtobufMapMode::ListOfStructs;
  69. case EFlag::MAP_AS_DICT:
  70. return EProtobufMapMode::Dict;
  71. case EFlag::MAP_AS_OPTIONAL_DICT:
  72. return EProtobufMapMode::OptionalDict;
  73. case EFlag::EMBEDDED:
  74. return EProtobufSerializationMode::Embedded;
  75. case EFlag::ENUM_SKIP_UNKNOWN_VALUES:
  76. return EProtobufEnumWritingMode::SkipUnknownValues;
  77. case EFlag::ENUM_CHECK_VALUES:
  78. return EProtobufEnumWritingMode::CheckValues;
  79. }
  80. Y_ABORT();
  81. }
  82. TMessageOption MessageFlagToOption(EWrapperMessageFlag::Enum flag)
  83. {
  84. using EFlag = EWrapperMessageFlag;
  85. switch (flag) {
  86. case EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE:
  87. return EProtobufFieldSortOrder::AsInProtoFile;
  88. case EFlag::SORT_FIELDS_BY_FIELD_NUMBER:
  89. return EProtobufFieldSortOrder::ByFieldNumber;
  90. }
  91. Y_ABORT();
  92. }
  93. TOneofOption OneofFlagToOption(EWrapperOneofFlag::Enum flag)
  94. {
  95. using EFlag = EWrapperOneofFlag;
  96. switch (flag) {
  97. case EFlag::SEPARATE_FIELDS:
  98. return EProtobufOneofMode::SeparateFields;
  99. case EFlag::VARIANT:
  100. return EProtobufOneofMode::Variant;
  101. }
  102. Y_ABORT();
  103. }
  104. EWrapperFieldFlag::Enum OptionToFieldFlag(TFieldOption option)
  105. {
  106. using EFlag = EWrapperFieldFlag;
  107. struct TVisitor
  108. {
  109. EFlag::Enum operator() (EProtobufType type)
  110. {
  111. switch (type) {
  112. case EProtobufType::Any:
  113. return EFlag::ANY;
  114. case EProtobufType::OtherColumns:
  115. return EFlag::OTHER_COLUMNS;
  116. case EProtobufType::EnumInt:
  117. return EFlag::ENUM_INT;
  118. case EProtobufType::EnumString:
  119. return EFlag::ENUM_STRING;
  120. }
  121. Y_ABORT();
  122. }
  123. EFlag::Enum operator() (EProtobufSerializationMode serializationMode)
  124. {
  125. switch (serializationMode) {
  126. case EProtobufSerializationMode::Yt:
  127. return EFlag::SERIALIZATION_YT;
  128. case EProtobufSerializationMode::Protobuf:
  129. return EFlag::SERIALIZATION_PROTOBUF;
  130. case EProtobufSerializationMode::Embedded:
  131. return EFlag::EMBEDDED;
  132. }
  133. Y_ABORT();
  134. }
  135. EFlag::Enum operator() (EProtobufListMode listMode)
  136. {
  137. switch (listMode) {
  138. case EProtobufListMode::Optional:
  139. return EFlag::OPTIONAL_LIST;
  140. case EProtobufListMode::Required:
  141. return EFlag::REQUIRED_LIST;
  142. }
  143. Y_ABORT();
  144. }
  145. EFlag::Enum operator() (EProtobufMapMode mapMode)
  146. {
  147. switch (mapMode) {
  148. case EProtobufMapMode::ListOfStructsLegacy:
  149. return EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY;
  150. case EProtobufMapMode::ListOfStructs:
  151. return EFlag::MAP_AS_LIST_OF_STRUCTS;
  152. case EProtobufMapMode::Dict:
  153. return EFlag::MAP_AS_DICT;
  154. case EProtobufMapMode::OptionalDict:
  155. return EFlag::MAP_AS_OPTIONAL_DICT;
  156. }
  157. Y_ABORT();
  158. }
  159. EFlag::Enum operator() (EProtobufEnumWritingMode enumWritingMode)
  160. {
  161. switch (enumWritingMode) {
  162. case EProtobufEnumWritingMode::SkipUnknownValues:
  163. return EFlag::ENUM_SKIP_UNKNOWN_VALUES;
  164. case EProtobufEnumWritingMode::CheckValues:
  165. return EFlag::ENUM_CHECK_VALUES;
  166. }
  167. Y_ABORT();
  168. }
  169. };
  170. return std::visit(TVisitor(), option);
  171. }
  172. EWrapperMessageFlag::Enum OptionToMessageFlag(TMessageOption option)
  173. {
  174. using EFlag = EWrapperMessageFlag;
  175. struct TVisitor
  176. {
  177. EFlag::Enum operator() (EProtobufFieldSortOrder sortOrder)
  178. {
  179. switch (sortOrder) {
  180. case EProtobufFieldSortOrder::AsInProtoFile:
  181. return EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE;
  182. case EProtobufFieldSortOrder::ByFieldNumber:
  183. return EFlag::SORT_FIELDS_BY_FIELD_NUMBER;
  184. }
  185. Y_ABORT();
  186. }
  187. };
  188. return std::visit(TVisitor(), option);
  189. }
  190. EWrapperOneofFlag::Enum OptionToOneofFlag(TOneofOption option)
  191. {
  192. using EFlag = EWrapperOneofFlag;
  193. struct TVisitor
  194. {
  195. EFlag::Enum operator() (EProtobufOneofMode mode)
  196. {
  197. switch (mode) {
  198. case EProtobufOneofMode::SeparateFields:
  199. return EFlag::SEPARATE_FIELDS;
  200. case EProtobufOneofMode::Variant:
  201. return EFlag::VARIANT;
  202. }
  203. Y_ABORT();
  204. }
  205. };
  206. return std::visit(TVisitor(), option);
  207. }
  208. template <typename T, typename TOptionToFlag>
  209. void SetOption(TMaybe<T>& option, T newOption, TOptionToFlag optionToFlag)
  210. {
  211. if (option) {
  212. if (*option == newOption) {
  213. ythrow yexception() << "Duplicate protobuf flag " << optionToFlag(newOption);
  214. } else {
  215. ythrow yexception() << "Incompatible protobuf flags " <<
  216. optionToFlag(*option) << " and " << optionToFlag(newOption);
  217. }
  218. }
  219. option = newOption;
  220. }
  221. class TParseProtobufFieldOptionsVisitor
  222. {
  223. public:
  224. void operator() (EProtobufType type)
  225. {
  226. SetOption(Type, type);
  227. }
  228. void operator() (EProtobufSerializationMode serializationMode)
  229. {
  230. SetOption(SerializationMode, serializationMode);
  231. }
  232. void operator() (EProtobufListMode listMode)
  233. {
  234. SetOption(ListMode, listMode);
  235. }
  236. void operator() (EProtobufMapMode mapMode)
  237. {
  238. SetOption(MapMode, mapMode);
  239. }
  240. void operator() (EProtobufEnumWritingMode enumWritingMode)
  241. {
  242. SetOption(EnumWritingMode, enumWritingMode);
  243. }
  244. template <typename T>
  245. void SetOption(TMaybe<T>& option, T newOption)
  246. {
  247. NYT::NDetail::SetOption(option, newOption, OptionToFieldFlag);
  248. }
  249. public:
  250. TMaybe<EProtobufType> Type;
  251. TMaybe<EProtobufSerializationMode> SerializationMode;
  252. TMaybe<EProtobufListMode> ListMode;
  253. TMaybe<EProtobufMapMode> MapMode;
  254. TMaybe<EProtobufEnumWritingMode> EnumWritingMode;
  255. };
  256. class TParseProtobufMessageOptionsVisitor
  257. {
  258. public:
  259. void operator() (EProtobufFieldSortOrder fieldSortOrder)
  260. {
  261. SetOption(FieldSortOrder, fieldSortOrder);
  262. }
  263. template <typename T>
  264. void SetOption(TMaybe<T>& option, T newOption)
  265. {
  266. NYT::NDetail::SetOption(option, newOption, OptionToMessageFlag);
  267. }
  268. public:
  269. TMaybe<EProtobufFieldSortOrder> FieldSortOrder;
  270. };
  271. class TParseProtobufOneofOptionsVisitor
  272. {
  273. public:
  274. void operator() (EProtobufOneofMode mode)
  275. {
  276. SetOption(Mode, mode);
  277. }
  278. template <typename T>
  279. void SetOption(TMaybe<T>& option, T newOption)
  280. {
  281. NYT::NDetail::SetOption(option, newOption, OptionToOneofFlag);
  282. }
  283. public:
  284. TMaybe<EProtobufOneofMode> Mode;
  285. };
  286. void ParseProtobufFieldOptions(
  287. const ::google::protobuf::RepeatedField<EWrapperFieldFlag::Enum>& flags,
  288. TProtobufFieldOptions* fieldOptions)
  289. {
  290. TParseProtobufFieldOptionsVisitor visitor;
  291. for (auto flag : flags) {
  292. std::visit(visitor, FieldFlagToOption(flag));
  293. }
  294. if (visitor.Type) {
  295. fieldOptions->Type = *visitor.Type;
  296. }
  297. if (visitor.SerializationMode) {
  298. fieldOptions->SerializationMode = *visitor.SerializationMode;
  299. }
  300. if (visitor.ListMode) {
  301. fieldOptions->ListMode = *visitor.ListMode;
  302. }
  303. if (visitor.MapMode) {
  304. fieldOptions->MapMode = *visitor.MapMode;
  305. }
  306. }
  307. void ParseProtobufMessageOptions(
  308. const ::google::protobuf::RepeatedField<EWrapperMessageFlag::Enum>& flags,
  309. TProtobufMessageOptions* messageOptions)
  310. {
  311. TParseProtobufMessageOptionsVisitor visitor;
  312. for (auto flag : flags) {
  313. std::visit(visitor, MessageFlagToOption(flag));
  314. }
  315. if (visitor.FieldSortOrder) {
  316. messageOptions->FieldSortOrder = *visitor.FieldSortOrder;
  317. }
  318. }
  319. void ParseProtobufOneofOptions(
  320. const ::google::protobuf::RepeatedField<EWrapperOneofFlag::Enum>& flags,
  321. TProtobufOneofOptions* messageOptions)
  322. {
  323. TParseProtobufOneofOptionsVisitor visitor;
  324. for (auto flag : flags) {
  325. std::visit(visitor, OneofFlagToOption(flag));
  326. }
  327. if (visitor.Mode) {
  328. messageOptions->Mode = *visitor.Mode;
  329. }
  330. }
  331. TProtobufFieldOptions GetDefaultFieldOptions(
  332. const Descriptor* descriptor,
  333. TProtobufFieldOptions defaultFieldOptions = {})
  334. {
  335. ParseProtobufFieldOptions(
  336. descriptor->file()->options().GetRepeatedExtension(file_default_field_flags),
  337. &defaultFieldOptions);
  338. ParseProtobufFieldOptions(
  339. descriptor->options().GetRepeatedExtension(default_field_flags),
  340. &defaultFieldOptions);
  341. return defaultFieldOptions;
  342. }
  343. TProtobufOneofOptions GetDefaultOneofOptions(const Descriptor* descriptor)
  344. {
  345. TProtobufOneofOptions defaultOneofOptions;
  346. ParseProtobufOneofOptions(
  347. descriptor->file()->options().GetRepeatedExtension(file_default_oneof_flags),
  348. &defaultOneofOptions);
  349. ParseProtobufOneofOptions(
  350. descriptor->options().GetRepeatedExtension(default_oneof_flags),
  351. &defaultOneofOptions);
  352. switch (defaultOneofOptions.Mode) {
  353. case EProtobufOneofMode::Variant: {
  354. auto defaultFieldOptions = GetDefaultFieldOptions(descriptor);
  355. switch (defaultFieldOptions.SerializationMode) {
  356. case EProtobufSerializationMode::Protobuf:
  357. // For Protobuf serialization mode default is SeparateFields.
  358. defaultOneofOptions.Mode = EProtobufOneofMode::SeparateFields;
  359. return defaultOneofOptions;
  360. case EProtobufSerializationMode::Yt:
  361. case EProtobufSerializationMode::Embedded:
  362. return defaultOneofOptions;
  363. }
  364. Y_ABORT();
  365. }
  366. case EProtobufOneofMode::SeparateFields:
  367. return defaultOneofOptions;
  368. }
  369. Y_ABORT();
  370. }
  371. ////////////////////////////////////////////////////////////////////////////////
  372. void ValidateProtobufType(const FieldDescriptor& fieldDescriptor, EProtobufType protobufType)
  373. {
  374. const auto fieldType = fieldDescriptor.type();
  375. auto ensureType = [&] (FieldDescriptor::Type expectedType) {
  376. Y_ENSURE(fieldType == expectedType,
  377. "Type of field " << fieldDescriptor.name() << "does not match specified field flag " <<
  378. OptionToFieldFlag(protobufType) << ": "
  379. "expected " << FieldDescriptor::TypeName(expectedType) << ", " <<
  380. "got " << FieldDescriptor::TypeName(fieldType));
  381. };
  382. switch (protobufType) {
  383. case EProtobufType::Any:
  384. ensureType(FieldDescriptor::TYPE_BYTES);
  385. return;
  386. case EProtobufType::OtherColumns:
  387. ensureType(FieldDescriptor::TYPE_BYTES);
  388. return;
  389. case EProtobufType::EnumInt:
  390. ensureType(FieldDescriptor::TYPE_ENUM);
  391. return;
  392. case EProtobufType::EnumString:
  393. ensureType(FieldDescriptor::TYPE_ENUM);
  394. return;
  395. }
  396. Y_ABORT();
  397. }
  398. ////////////////////////////////////////////////////////////////////////////////
  399. class TCycleChecker
  400. {
  401. private:
  402. class TGuard
  403. {
  404. public:
  405. TGuard(TCycleChecker* checker, const Descriptor* descriptor)
  406. : Checker_(checker)
  407. , Descriptor_(descriptor)
  408. {
  409. Checker_->ActiveVertices_.insert(Descriptor_);
  410. Checker_->Stack_.push(Descriptor_);
  411. }
  412. ~TGuard()
  413. {
  414. Checker_->ActiveVertices_.erase(Descriptor_);
  415. Checker_->Stack_.pop();
  416. }
  417. private:
  418. TCycleChecker* Checker_;
  419. const Descriptor* Descriptor_;
  420. };
  421. public:
  422. [[nodiscard]] TGuard Enter(const Descriptor* descriptor)
  423. {
  424. if (ActiveVertices_.contains(descriptor)) {
  425. Y_ABORT_UNLESS(!Stack_.empty());
  426. ythrow TApiUsageError() << "Cyclic reference found for protobuf messages. " <<
  427. "Consider removing " << EWrapperFieldFlag::SERIALIZATION_YT << " flag " <<
  428. "somewhere on the cycle containing " <<
  429. Stack_.top()->full_name() << " and " << descriptor->full_name();
  430. }
  431. return TGuard(this, descriptor);
  432. }
  433. private:
  434. THashSet<const Descriptor*> ActiveVertices_;
  435. TStack<const Descriptor*> Stack_;
  436. };
  437. ////////////////////////////////////////////////////////////////////////////////
  438. } // namespace
  439. ////////////////////////////////////////////////////////////////////////////////
  440. TProtobufFieldOptions GetFieldOptions(
  441. const FieldDescriptor* fieldDescriptor,
  442. const TMaybe<TProtobufFieldOptions>& defaultFieldOptions)
  443. {
  444. TProtobufFieldOptions options;
  445. if (defaultFieldOptions) {
  446. options = *defaultFieldOptions;
  447. } else {
  448. options = GetDefaultFieldOptions(fieldDescriptor->containing_type());
  449. }
  450. ParseProtobufFieldOptions(fieldDescriptor->options().GetRepeatedExtension(flags), &options);
  451. return options;
  452. }
  453. TProtobufOneofOptions GetOneofOptions(
  454. const OneofDescriptor* oneofDescriptor,
  455. const TMaybe<TProtobufOneofOptions>& defaultOneofOptions)
  456. {
  457. TProtobufOneofOptions options;
  458. if (defaultOneofOptions) {
  459. options = *defaultOneofOptions;
  460. } else {
  461. options = GetDefaultOneofOptions(oneofDescriptor->containing_type());
  462. }
  463. ParseProtobufOneofOptions(oneofDescriptor->options().GetRepeatedExtension(oneof_flags), &options);
  464. if (oneofDescriptor->is_synthetic()) {
  465. options.Mode = EProtobufOneofMode::SeparateFields;
  466. }
  467. auto variantFieldName = oneofDescriptor->options().GetExtension(variant_field_name);
  468. switch (options.Mode) {
  469. case EProtobufOneofMode::SeparateFields:
  470. if (!variantFieldName.empty()) {
  471. ythrow TApiUsageError() << "\"variant_field_name\" requires (NYT.oneof_flags) = VARIANT";
  472. }
  473. break;
  474. case EProtobufOneofMode::Variant:
  475. if (variantFieldName.empty()) {
  476. options.VariantFieldName = FromProto<TString>(oneofDescriptor->name());
  477. } else {
  478. options.VariantFieldName = variantFieldName;
  479. }
  480. break;
  481. }
  482. return options;
  483. }
  484. TProtobufMessageOptions GetMessageOptions(const Descriptor* descriptor)
  485. {
  486. TProtobufMessageOptions options;
  487. ParseProtobufMessageOptions(
  488. descriptor->file()->options().GetRepeatedExtension(file_default_message_flags),
  489. &options);
  490. ParseProtobufMessageOptions(
  491. descriptor->options().GetRepeatedExtension(message_flags),
  492. &options);
  493. return options;
  494. }
  495. TNode MakeEnumerationConfig(const ::google::protobuf::EnumDescriptor* enumDescriptor)
  496. {
  497. auto config = TNode::CreateMap();
  498. for (int i = 0; i < enumDescriptor->value_count(); ++i) {
  499. config[enumDescriptor->value(i)->name()] = enumDescriptor->value(i)->number();
  500. }
  501. return config;
  502. }
  503. TString DeduceProtobufType(
  504. const FieldDescriptor* fieldDescriptor,
  505. const TProtobufFieldOptions& options)
  506. {
  507. if (options.Type) {
  508. ValidateProtobufType(*fieldDescriptor, *options.Type);
  509. return ToString(*options.Type);
  510. }
  511. switch (fieldDescriptor->type()) {
  512. case FieldDescriptor::TYPE_ENUM:
  513. return ToString(EProtobufType::EnumString);
  514. case FieldDescriptor::TYPE_MESSAGE:
  515. switch (options.SerializationMode) {
  516. case EProtobufSerializationMode::Protobuf:
  517. return "message";
  518. case EProtobufSerializationMode::Yt:
  519. return "structured_message";
  520. case EProtobufSerializationMode::Embedded:
  521. return "embedded_message";
  522. }
  523. Y_ABORT();
  524. default:
  525. return fieldDescriptor->type_name();
  526. }
  527. Y_ABORT();
  528. }
  529. TString GetColumnName(const ::google::protobuf::FieldDescriptor& field)
  530. {
  531. const auto& options = field.options();
  532. const auto columnName = FromProto<TString>(options.GetExtension(column_name));
  533. if (!columnName.empty()) {
  534. return columnName;
  535. }
  536. const auto keyColumnName = FromProto<TString>(options.GetExtension(key_column_name));
  537. if (!keyColumnName.empty()) {
  538. return keyColumnName;
  539. }
  540. return FromProto<TString>(field.name());
  541. }
  542. TNode MakeProtoFormatMessageFieldsConfig(
  543. const Descriptor* descriptor,
  544. TNode* enumerations,
  545. TCycleChecker& cycleChecker);
  546. TNode MakeProtoFormatMessageFieldsConfig(
  547. const Descriptor* descriptor,
  548. TNode* enumerations,
  549. const TProtobufFieldOptions& defaultFieldOptions,
  550. const TProtobufOneofOptions& defaultOneofOptions,
  551. TCycleChecker& cycleChecker);
  552. TNode MakeMapFieldsConfig(
  553. const FieldDescriptor* fieldDescriptor,
  554. TNode* enumerations,
  555. const TProtobufFieldOptions& fieldOptions,
  556. TCycleChecker& cycleChecker)
  557. {
  558. Y_ABORT_UNLESS(fieldDescriptor->is_map());
  559. auto message = fieldDescriptor->message_type();
  560. switch (fieldOptions.MapMode) {
  561. case EProtobufMapMode::ListOfStructsLegacy:
  562. return MakeProtoFormatMessageFieldsConfig(
  563. message,
  564. enumerations,
  565. cycleChecker);
  566. case EProtobufMapMode::ListOfStructs:
  567. case EProtobufMapMode::Dict:
  568. case EProtobufMapMode::OptionalDict: {
  569. TProtobufFieldOptions defaultFieldOptions;
  570. defaultFieldOptions.SerializationMode = EProtobufSerializationMode::Yt;
  571. return MakeProtoFormatMessageFieldsConfig(
  572. message,
  573. enumerations,
  574. defaultFieldOptions,
  575. TProtobufOneofOptions{},
  576. cycleChecker);
  577. }
  578. }
  579. Y_ABORT();
  580. }
  581. TNode MakeProtoFormatFieldConfig(
  582. const FieldDescriptor* fieldDescriptor,
  583. TNode* enumerations,
  584. const TProtobufFieldOptions& defaultOptions,
  585. TCycleChecker& cycleChecker)
  586. {
  587. auto fieldConfig = TNode::CreateMap();
  588. fieldConfig["field_number"] = fieldDescriptor->number();
  589. fieldConfig["name"] = GetColumnName(*fieldDescriptor);
  590. auto fieldOptions = GetFieldOptions(fieldDescriptor, defaultOptions);
  591. Y_ENSURE(fieldOptions.SerializationMode != EProtobufSerializationMode::Embedded,
  592. "EMBEDDED flag is currently supported only with "
  593. "ProtobufFormatWithDescriptors config option set to true");
  594. if (fieldDescriptor->is_repeated()) {
  595. Y_ENSURE_EX(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt,
  596. TApiUsageError() << "Repeated field \"" << fieldDescriptor->full_name() << "\" " <<
  597. "must have flag \"" << EWrapperFieldFlag::SERIALIZATION_YT << "\"");
  598. }
  599. fieldConfig["repeated"] = fieldDescriptor->is_repeated();
  600. fieldConfig["packed"] = fieldDescriptor->is_packed();
  601. fieldConfig["proto_type"] = DeduceProtobufType(fieldDescriptor, fieldOptions);
  602. if (fieldDescriptor->type() == FieldDescriptor::TYPE_ENUM) {
  603. auto* enumeration = fieldDescriptor->enum_type();
  604. (*enumerations)[enumeration->full_name()] = MakeEnumerationConfig(enumeration);
  605. fieldConfig["enumeration_name"] = FromProto<TString>(enumeration->full_name());
  606. }
  607. if (fieldOptions.SerializationMode != EProtobufSerializationMode::Yt) {
  608. return fieldConfig;
  609. }
  610. if (fieldDescriptor->is_map()) {
  611. fieldConfig["fields"] = MakeMapFieldsConfig(fieldDescriptor, enumerations, fieldOptions, cycleChecker);
  612. return fieldConfig;
  613. }
  614. if (fieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE) {
  615. fieldConfig["fields"] = MakeProtoFormatMessageFieldsConfig(
  616. fieldDescriptor->message_type(),
  617. enumerations,
  618. cycleChecker);
  619. }
  620. return fieldConfig;
  621. }
  622. void MakeProtoFormatOneofConfig(
  623. const OneofDescriptor* oneofDescriptor,
  624. TNode* enumerations,
  625. const TProtobufFieldOptions& defaultFieldOptions,
  626. const TProtobufOneofOptions& defaultOneofOptions,
  627. TCycleChecker& cycleChecker,
  628. TNode* fields)
  629. {
  630. auto addFields = [&] (TNode* fields) {
  631. for (int i = 0; i < oneofDescriptor->field_count(); ++i) {
  632. fields->Add(MakeProtoFormatFieldConfig(
  633. oneofDescriptor->field(i),
  634. enumerations,
  635. defaultFieldOptions,
  636. cycleChecker));
  637. }
  638. };
  639. auto oneofOptions = GetOneofOptions(oneofDescriptor, defaultOneofOptions);
  640. switch (oneofOptions.Mode) {
  641. case EProtobufOneofMode::SeparateFields:
  642. addFields(fields);
  643. return;
  644. case EProtobufOneofMode::Variant: {
  645. auto oneofFields = TNode::CreateList();
  646. addFields(&oneofFields);
  647. auto oneofField = TNode()
  648. ("proto_type", "oneof")
  649. ("name", oneofOptions.VariantFieldName)
  650. ("fields", std::move(oneofFields));
  651. fields->Add(std::move(oneofField));
  652. return;
  653. }
  654. }
  655. Y_ABORT();
  656. }
  657. TNode MakeProtoFormatMessageFieldsConfig(
  658. const Descriptor* descriptor,
  659. TNode* enumerations,
  660. const TProtobufFieldOptions& defaultFieldOptions,
  661. const TProtobufOneofOptions& defaultOneofOptions,
  662. TCycleChecker& cycleChecker)
  663. {
  664. auto fields = TNode::CreateList();
  665. THashSet<const OneofDescriptor*> visitedOneofs;
  666. auto guard = cycleChecker.Enter(descriptor);
  667. for (int fieldIndex = 0; fieldIndex < descriptor->field_count(); ++fieldIndex) {
  668. auto fieldDescriptor = descriptor->field(fieldIndex);
  669. auto oneofDescriptor = fieldDescriptor->containing_oneof();
  670. if (!oneofDescriptor) {
  671. fields.Add(MakeProtoFormatFieldConfig(
  672. fieldDescriptor,
  673. enumerations,
  674. defaultFieldOptions,
  675. cycleChecker));
  676. } else if (!visitedOneofs.contains(oneofDescriptor)) {
  677. MakeProtoFormatOneofConfig(
  678. oneofDescriptor,
  679. enumerations,
  680. defaultFieldOptions,
  681. defaultOneofOptions,
  682. cycleChecker,
  683. &fields);
  684. visitedOneofs.insert(oneofDescriptor);
  685. }
  686. }
  687. return fields;
  688. }
  689. TNode MakeProtoFormatMessageFieldsConfig(
  690. const Descriptor* descriptor,
  691. TNode* enumerations,
  692. TCycleChecker& cycleChecker)
  693. {
  694. return MakeProtoFormatMessageFieldsConfig(
  695. descriptor,
  696. enumerations,
  697. GetDefaultFieldOptions(descriptor),
  698. GetDefaultOneofOptions(descriptor),
  699. cycleChecker);
  700. }
  701. TNode MakeProtoFormatConfigWithTables(const TVector<const Descriptor*>& descriptors)
  702. {
  703. TNode config("protobuf");
  704. config.Attributes()
  705. ("enumerations", TNode::CreateMap())
  706. ("tables", TNode::CreateList());
  707. auto& enumerations = config.Attributes()["enumerations"];
  708. for (auto* descriptor : descriptors) {
  709. TCycleChecker cycleChecker;
  710. auto columns = MakeProtoFormatMessageFieldsConfig(descriptor, &enumerations, cycleChecker);
  711. config.Attributes()["tables"].Add(
  712. TNode()("columns", std::move(columns)));
  713. }
  714. return config;
  715. }
  716. ////////////////////////////////////////////////////////////////////////////////
  717. class TFileDescriptorSetBuilder
  718. {
  719. public:
  720. TFileDescriptorSetBuilder()
  721. : ExtensionFile_(EWrapperFieldFlag::descriptor()->file())
  722. { }
  723. void AddDescriptor(const Descriptor* descriptor)
  724. {
  725. auto [it, inserted] = AllDescriptors_.insert(descriptor);
  726. if (!inserted) {
  727. return;
  728. }
  729. const auto* containingType = descriptor->containing_type();
  730. while (containingType) {
  731. AddDescriptor(containingType);
  732. containingType = containingType->containing_type();
  733. }
  734. for (int i = 0; i < descriptor->field_count(); ++i) {
  735. AddField(descriptor->field(i));
  736. }
  737. }
  738. FileDescriptorSet Build()
  739. {
  740. THashSet<const FileDescriptor*> visitedFiles;
  741. TVector<const FileDescriptor*> fileTopoOrder;
  742. for (const auto* descriptor : AllDescriptors_) {
  743. TraverseDependencies(descriptor->file(), visitedFiles, fileTopoOrder);
  744. }
  745. THashSet<TString> messageTypeNames;
  746. THashSet<TString> enumTypeNames;
  747. for (const auto* descriptor : AllDescriptors_) {
  748. messageTypeNames.insert(FromProto<TString>(descriptor->full_name()));
  749. }
  750. for (const auto* enumDescriptor : EnumDescriptors_) {
  751. enumTypeNames.insert(FromProto<TString>(enumDescriptor->full_name()));
  752. }
  753. FileDescriptorSet fileDescriptorSetProto;
  754. for (const auto* file : fileTopoOrder) {
  755. auto* fileProto = fileDescriptorSetProto.add_file();
  756. file->CopyTo(fileProto);
  757. Strip(fileProto, messageTypeNames, enumTypeNames);
  758. }
  759. return fileDescriptorSetProto;
  760. }
  761. private:
  762. void AddField(const FieldDescriptor* fieldDescriptor)
  763. {
  764. if (fieldDescriptor->message_type()) {
  765. AddDescriptor(fieldDescriptor->message_type());
  766. }
  767. if (fieldDescriptor->enum_type()) {
  768. AddEnumDescriptor(fieldDescriptor->enum_type());
  769. }
  770. }
  771. void AddEnumDescriptor(const EnumDescriptor* enumDescriptor)
  772. {
  773. auto [it, inserted] = EnumDescriptors_.insert(enumDescriptor);
  774. if (!inserted) {
  775. return;
  776. }
  777. const auto* containingType = enumDescriptor->containing_type();
  778. while (containingType) {
  779. AddDescriptor(containingType);
  780. containingType = containingType->containing_type();
  781. }
  782. }
  783. void TraverseDependencies(
  784. const FileDescriptor* current,
  785. THashSet<const FileDescriptor*>& visited,
  786. TVector<const FileDescriptor*>& topoOrder)
  787. {
  788. auto [it, inserted] = visited.insert(current);
  789. if (!inserted) {
  790. return;
  791. }
  792. for (int i = 0; i < current->dependency_count(); ++i) {
  793. TraverseDependencies(current->dependency(i), visited, topoOrder);
  794. }
  795. topoOrder.push_back(current);
  796. }
  797. template <typename TOptions>
  798. void StripUnknownOptions(TOptions* options)
  799. {
  800. std::vector<const FieldDescriptor*> fields;
  801. auto reflection = options->GetReflection();
  802. reflection->ListFields(*options, &fields);
  803. for (auto field : fields) {
  804. if (field->is_extension() && field->file() != ExtensionFile_) {
  805. reflection->ClearField(options, field);
  806. }
  807. }
  808. }
  809. template <typename TRepeatedField, typename TPredicate>
  810. void RemoveIf(TRepeatedField* repeatedField, TPredicate predicate)
  811. {
  812. repeatedField->erase(
  813. std::remove_if(repeatedField->begin(), repeatedField->end(), predicate),
  814. repeatedField->end());
  815. }
  816. void Strip(
  817. const TString& containingTypePrefix,
  818. DescriptorProto* messageProto,
  819. const THashSet<TString>& messageTypeNames,
  820. const THashSet<TString>& enumTypeNames)
  821. {
  822. const auto prefix = containingTypePrefix + messageProto->name() + '.';
  823. RemoveIf(messageProto->mutable_nested_type(), [&] (const DescriptorProto& descriptorProto) {
  824. return !messageTypeNames.contains(prefix + descriptorProto.name());
  825. });
  826. RemoveIf(messageProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) {
  827. return !enumTypeNames.contains(prefix + enumDescriptorProto.name());
  828. });
  829. messageProto->clear_extension();
  830. StripUnknownOptions(messageProto->mutable_options());
  831. for (auto& fieldProto : *messageProto->mutable_field()) {
  832. StripUnknownOptions(fieldProto.mutable_options());
  833. }
  834. for (auto& oneofProto : *messageProto->mutable_oneof_decl()) {
  835. StripUnknownOptions(oneofProto.mutable_options());
  836. }
  837. for (auto& nestedTypeProto : *messageProto->mutable_nested_type()) {
  838. Strip(prefix, &nestedTypeProto, messageTypeNames, enumTypeNames);
  839. }
  840. for (auto& enumProto : *messageProto->mutable_enum_type()) {
  841. StripUnknownOptions(enumProto.mutable_options());
  842. for (auto& enumValue : *enumProto.mutable_value()) {
  843. StripUnknownOptions(enumValue.mutable_options());
  844. }
  845. }
  846. }
  847. void Strip(
  848. FileDescriptorProto* fileProto,
  849. const THashSet<TString>& messageTypeNames,
  850. const THashSet<TString>& enumTypeNames)
  851. {
  852. const auto prefix = fileProto->package().empty()
  853. ? ""
  854. : FromProto<TString>(fileProto->package()) + '.';
  855. RemoveIf(fileProto->mutable_message_type(), [&] (const DescriptorProto& descriptorProto) {
  856. return !messageTypeNames.contains(prefix + descriptorProto.name());
  857. });
  858. RemoveIf(fileProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) {
  859. return !enumTypeNames.contains(prefix + enumDescriptorProto.name());
  860. });
  861. fileProto->clear_service();
  862. fileProto->clear_extension();
  863. StripUnknownOptions(fileProto->mutable_options());
  864. for (auto& messageProto : *fileProto->mutable_message_type()) {
  865. Strip(prefix, &messageProto, messageTypeNames, enumTypeNames);
  866. }
  867. for (auto& enumProto : *fileProto->mutable_enum_type()) {
  868. StripUnknownOptions(enumProto.mutable_options());
  869. for (auto& enumValue : *enumProto.mutable_value()) {
  870. StripUnknownOptions(enumValue.mutable_options());
  871. }
  872. }
  873. }
  874. private:
  875. const FileDescriptor* const ExtensionFile_;
  876. THashSet<const Descriptor*> AllDescriptors_;
  877. THashSet<const EnumDescriptor*> EnumDescriptors_;
  878. };
  879. TNode MakeProtoFormatConfigWithDescriptors(const TVector<const Descriptor*>& descriptors)
  880. {
  881. TFileDescriptorSetBuilder builder;
  882. auto typeNames = TNode::CreateList();
  883. for (const auto* descriptor : descriptors) {
  884. builder.AddDescriptor(descriptor);
  885. typeNames.Add(FromProto<TString>(descriptor->full_name()));
  886. }
  887. auto fileDescriptorSetText = FromProto<TString>(builder.Build().ShortDebugString());
  888. TNode config("protobuf");
  889. config.Attributes()
  890. ("file_descriptor_set_text", std::move(fileDescriptorSetText))
  891. ("type_names", std::move(typeNames));
  892. return config;
  893. }
  894. ////////////////////////////////////////////////////////////////////////////////
  895. using TTypePtrOrOtherColumns = std::variant<NTi::TTypePtr, TOtherColumns>;
  896. struct TMember {
  897. TString Name;
  898. TTypePtrOrOtherColumns TypeOrOtherColumns;
  899. };
  900. ////////////////////////////////////////////////////////////////////////////////
  901. TValueTypeOrOtherColumns GetScalarFieldType(
  902. const FieldDescriptor& fieldDescriptor,
  903. const TProtobufFieldOptions& options)
  904. {
  905. if (options.Type) {
  906. switch (*options.Type) {
  907. case EProtobufType::EnumInt:
  908. return EValueType::VT_INT64;
  909. case EProtobufType::EnumString:
  910. return EValueType::VT_STRING;
  911. case EProtobufType::Any:
  912. return EValueType::VT_ANY;
  913. case EProtobufType::OtherColumns:
  914. return TOtherColumns{};
  915. }
  916. Y_ABORT();
  917. }
  918. switch (fieldDescriptor.cpp_type()) {
  919. case FieldDescriptor::CPPTYPE_INT32:
  920. return EValueType::VT_INT32;
  921. case FieldDescriptor::CPPTYPE_INT64:
  922. return EValueType::VT_INT64;
  923. case FieldDescriptor::CPPTYPE_UINT32:
  924. return EValueType::VT_UINT32;
  925. case FieldDescriptor::CPPTYPE_UINT64:
  926. return EValueType::VT_UINT64;
  927. case FieldDescriptor::CPPTYPE_FLOAT:
  928. case FieldDescriptor::CPPTYPE_DOUBLE:
  929. return EValueType::VT_DOUBLE;
  930. case FieldDescriptor::CPPTYPE_BOOL:
  931. return EValueType::VT_BOOLEAN;
  932. case FieldDescriptor::CPPTYPE_STRING:
  933. case FieldDescriptor::CPPTYPE_MESSAGE:
  934. case FieldDescriptor::CPPTYPE_ENUM:
  935. return EValueType::VT_STRING;
  936. default:
  937. ythrow yexception() <<
  938. "Unexpected field type '" << fieldDescriptor.cpp_type_name() << "' " <<
  939. "for field " << fieldDescriptor.name();
  940. }
  941. }
  942. bool HasNameExtension(const FieldDescriptor& fieldDescriptor)
  943. {
  944. const auto& options = fieldDescriptor.options();
  945. return options.HasExtension(column_name) || options.HasExtension(key_column_name);
  946. }
  947. void SortFields(TVector<const FieldDescriptor*>& fieldDescriptors, EProtobufFieldSortOrder fieldSortOrder)
  948. {
  949. switch (fieldSortOrder) {
  950. case EProtobufFieldSortOrder::AsInProtoFile:
  951. return;
  952. case EProtobufFieldSortOrder::ByFieldNumber:
  953. SortBy(fieldDescriptors, [] (const FieldDescriptor* fieldDescriptor) {
  954. return fieldDescriptor->number();
  955. });
  956. return;
  957. }
  958. Y_ABORT();
  959. }
  960. NTi::TTypePtr CreateStruct(TStringBuf fieldName, TVector<TMember> members)
  961. {
  962. TVector<NTi::TStructType::TOwnedMember> structMembers;
  963. structMembers.reserve(members.size());
  964. for (auto& member : members) {
  965. std::visit(TOverloaded{
  966. [&] (TOtherColumns) {
  967. ythrow TApiUsageError() <<
  968. "Could not deduce YT type for field " << member.Name << " of " <<
  969. "embedded message field " << fieldName << " " <<
  970. "(note that " << EWrapperFieldFlag::OTHER_COLUMNS << " fields " <<
  971. "are not allowed inside embedded messages)";
  972. },
  973. [&] (NTi::TTypePtr& type) {
  974. structMembers.emplace_back(std::move(member.Name), std::move(type));
  975. },
  976. }, member.TypeOrOtherColumns);
  977. }
  978. return NTi::Struct(std::move(structMembers));
  979. }
  980. TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor)
  981. {
  982. auto isOtherColumns = [] (const ::google::protobuf::FieldDescriptor& field) {
  983. return GetFieldOptions(&field).Type == EProtobufType::OtherColumns;
  984. };
  985. TVector<TString> result;
  986. result.reserve(descriptor.field_count());
  987. for (int i = 0; i < descriptor.field_count(); ++i) {
  988. const auto& field = *descriptor.field(i);
  989. if (isOtherColumns(field)) {
  990. return {};
  991. }
  992. result.push_back(GetColumnName(field));
  993. }
  994. return result;
  995. }
  996. ////////////////////////////////////////////////////////////////////////////////
  997. class TTableSchemaInferrer
  998. {
  999. public:
  1000. TTableSchemaInferrer(bool keepFieldsWithoutExtension)
  1001. : KeepFieldsWithoutExtension_(keepFieldsWithoutExtension)
  1002. { }
  1003. TTableSchema InferSchema(const Descriptor& messageDescriptor);
  1004. private:
  1005. TTypePtrOrOtherColumns GetFieldType(
  1006. const FieldDescriptor& fieldDescriptor,
  1007. const TProtobufFieldOptions& defaultOptions);
  1008. void ProcessOneofField(
  1009. TStringBuf containingFieldName,
  1010. const OneofDescriptor& oneofDescriptor,
  1011. const TProtobufFieldOptions& defaultFieldOptions,
  1012. const TProtobufOneofOptions& defaultOneofOptions,
  1013. EProtobufFieldSortOrder fieldSortOrder,
  1014. TVector<TMember>* members);
  1015. TVector<TMember> GetMessageMembers(
  1016. TStringBuf containingFieldName,
  1017. const Descriptor& fieldDescriptor,
  1018. TProtobufFieldOptions defaultFieldOptions,
  1019. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder = std::nullopt);
  1020. NTi::TTypePtr GetMessageType(
  1021. const FieldDescriptor& fieldDescriptor,
  1022. TProtobufFieldOptions defaultFieldOptions);
  1023. NTi::TTypePtr GetMapType(
  1024. const FieldDescriptor& fieldDescriptor,
  1025. const TProtobufFieldOptions& fieldOptions);
  1026. private:
  1027. void GetMessageMembersImpl(
  1028. TStringBuf containingFieldName,
  1029. const Descriptor& fieldDescriptor,
  1030. TProtobufFieldOptions defaultFieldOptions,
  1031. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder,
  1032. TVector<TMember>* members);
  1033. private:
  1034. const bool KeepFieldsWithoutExtension_;
  1035. TCycleChecker CycleChecker_;
  1036. };
  1037. void TTableSchemaInferrer::ProcessOneofField(
  1038. TStringBuf containingFieldName,
  1039. const OneofDescriptor& oneofDescriptor,
  1040. const TProtobufFieldOptions& defaultFieldOptions,
  1041. const TProtobufOneofOptions& defaultOneofOptions,
  1042. EProtobufFieldSortOrder fieldSortOrder,
  1043. TVector<TMember>* members)
  1044. {
  1045. auto oneofOptions = GetOneofOptions(&oneofDescriptor, defaultOneofOptions);
  1046. auto addFields = [&] (TVector<TMember>* members, bool removeOptionality) {
  1047. TVector<const FieldDescriptor*> fieldDescriptors;
  1048. for (int i = 0; i < oneofDescriptor.field_count(); ++i) {
  1049. fieldDescriptors.push_back(oneofDescriptor.field(i));
  1050. }
  1051. SortFields(fieldDescriptors, fieldSortOrder);
  1052. for (auto innerFieldDescriptor : fieldDescriptors) {
  1053. auto typeOrOtherColumns = GetFieldType(
  1054. *innerFieldDescriptor,
  1055. defaultFieldOptions);
  1056. if (auto* maybeType = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
  1057. maybeType && removeOptionality && (*maybeType)->IsOptional())
  1058. {
  1059. typeOrOtherColumns = (*maybeType)->AsOptional()->GetItemType();
  1060. }
  1061. members->push_back(TMember{
  1062. GetColumnName(*innerFieldDescriptor),
  1063. std::move(typeOrOtherColumns),
  1064. });
  1065. }
  1066. };
  1067. switch (oneofOptions.Mode) {
  1068. case EProtobufOneofMode::SeparateFields:
  1069. addFields(members, /* removeOptionality */ false);
  1070. return;
  1071. case EProtobufOneofMode::Variant: {
  1072. TVector<TMember> variantMembers;
  1073. addFields(&variantMembers, /* removeOptionality */ true);
  1074. members->push_back(TMember{
  1075. oneofOptions.VariantFieldName,
  1076. NTi::Optional(
  1077. NTi::Variant(
  1078. CreateStruct(containingFieldName, std::move(variantMembers))
  1079. )
  1080. )
  1081. });
  1082. return;
  1083. }
  1084. }
  1085. Y_ABORT();
  1086. }
  1087. TVector<TMember> TTableSchemaInferrer::GetMessageMembers(
  1088. TStringBuf containingFieldName,
  1089. const Descriptor& messageDescriptor,
  1090. TProtobufFieldOptions defaultFieldOptions,
  1091. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder)
  1092. {
  1093. TVector<TMember> members;
  1094. GetMessageMembersImpl(
  1095. containingFieldName,
  1096. messageDescriptor,
  1097. defaultFieldOptions,
  1098. overrideFieldSortOrder,
  1099. &members
  1100. );
  1101. return members;
  1102. }
  1103. void TTableSchemaInferrer::GetMessageMembersImpl(
  1104. TStringBuf containingFieldName,
  1105. const Descriptor& messageDescriptor,
  1106. TProtobufFieldOptions defaultFieldOptions,
  1107. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder,
  1108. TVector<TMember>* members)
  1109. {
  1110. auto guard = CycleChecker_.Enter(&messageDescriptor);
  1111. defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor, defaultFieldOptions);
  1112. auto messageOptions = GetMessageOptions(&messageDescriptor);
  1113. auto defaultOneofOptions = GetDefaultOneofOptions(&messageDescriptor);
  1114. TVector<const FieldDescriptor*> fieldDescriptors;
  1115. fieldDescriptors.reserve(messageDescriptor.field_count());
  1116. for (int i = 0; i < messageDescriptor.field_count(); ++i) {
  1117. if (!KeepFieldsWithoutExtension_ && !HasNameExtension(*messageDescriptor.field(i))) {
  1118. continue;
  1119. }
  1120. fieldDescriptors.push_back(messageDescriptor.field(i));
  1121. }
  1122. auto fieldSortOrder = overrideFieldSortOrder.value_or(messageOptions.FieldSortOrder);
  1123. SortFields(fieldDescriptors, fieldSortOrder);
  1124. THashSet<const OneofDescriptor*> visitedOneofs;
  1125. for (const auto innerFieldDescriptor : fieldDescriptors) {
  1126. auto oneofDescriptor = innerFieldDescriptor->containing_oneof();
  1127. if (oneofDescriptor) {
  1128. if (visitedOneofs.contains(oneofDescriptor)) {
  1129. continue;
  1130. }
  1131. ProcessOneofField(
  1132. containingFieldName,
  1133. *oneofDescriptor,
  1134. defaultFieldOptions,
  1135. defaultOneofOptions,
  1136. messageOptions.FieldSortOrder,
  1137. members);
  1138. visitedOneofs.insert(oneofDescriptor);
  1139. continue;
  1140. }
  1141. auto fieldOptions = GetFieldOptions(innerFieldDescriptor, defaultFieldOptions);
  1142. if (fieldOptions.SerializationMode == EProtobufSerializationMode::Embedded) {
  1143. Y_ENSURE(innerFieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE,
  1144. "EMBEDDED column must have message type");
  1145. Y_ENSURE(innerFieldDescriptor->label() == FieldDescriptor::LABEL_REQUIRED,
  1146. "EMBEDDED column must be marked required");
  1147. GetMessageMembersImpl(
  1148. innerFieldDescriptor->full_name(),
  1149. *innerFieldDescriptor->message_type(),
  1150. defaultFieldOptions,
  1151. /*overrideFieldSortOrder*/ std::nullopt,
  1152. members);
  1153. } else {
  1154. auto typeOrOtherColumns = GetFieldType(
  1155. *innerFieldDescriptor,
  1156. defaultFieldOptions);
  1157. members->push_back(TMember{
  1158. GetColumnName(*innerFieldDescriptor),
  1159. std::move(typeOrOtherColumns),
  1160. });
  1161. }
  1162. }
  1163. }
  1164. NTi::TTypePtr TTableSchemaInferrer::GetMessageType(
  1165. const FieldDescriptor& fieldDescriptor,
  1166. TProtobufFieldOptions defaultFieldOptions)
  1167. {
  1168. Y_ABORT_UNLESS(fieldDescriptor.message_type());
  1169. const auto& messageDescriptor = *fieldDescriptor.message_type();
  1170. auto members = GetMessageMembers(
  1171. fieldDescriptor.full_name(),
  1172. messageDescriptor,
  1173. defaultFieldOptions);
  1174. return CreateStruct(fieldDescriptor.full_name(), std::move(members));
  1175. }
  1176. NTi::TTypePtr TTableSchemaInferrer::GetMapType(
  1177. const FieldDescriptor& fieldDescriptor,
  1178. const TProtobufFieldOptions& fieldOptions)
  1179. {
  1180. Y_ABORT_UNLESS(fieldDescriptor.is_map());
  1181. switch (fieldOptions.MapMode) {
  1182. case EProtobufMapMode::ListOfStructsLegacy:
  1183. case EProtobufMapMode::ListOfStructs: {
  1184. TProtobufFieldOptions embeddedOptions;
  1185. if (fieldOptions.MapMode == EProtobufMapMode::ListOfStructs) {
  1186. embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt;
  1187. }
  1188. auto list = NTi::List(GetMessageType(fieldDescriptor, embeddedOptions));
  1189. switch (fieldOptions.ListMode) {
  1190. case EProtobufListMode::Required:
  1191. return list;
  1192. case EProtobufListMode::Optional:
  1193. return NTi::Optional(std::move(list));
  1194. }
  1195. Y_ABORT();
  1196. }
  1197. case EProtobufMapMode::Dict:
  1198. case EProtobufMapMode::OptionalDict: {
  1199. auto message = fieldDescriptor.message_type();
  1200. Y_ABORT_UNLESS(message->field_count() == 2);
  1201. auto keyVariant = GetScalarFieldType(*message->field(0), TProtobufFieldOptions{});
  1202. Y_ABORT_UNLESS(std::holds_alternative<EValueType>(keyVariant));
  1203. auto key = std::get<EValueType>(keyVariant);
  1204. TProtobufFieldOptions embeddedOptions;
  1205. embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt;
  1206. auto valueVariant = GetFieldType(*message->field(1), embeddedOptions);
  1207. Y_ABORT_UNLESS(std::holds_alternative<NTi::TTypePtr>(valueVariant));
  1208. auto value = std::get<NTi::TTypePtr>(valueVariant);
  1209. Y_ABORT_UNLESS(value->IsOptional());
  1210. value = value->AsOptional()->GetItemType();
  1211. auto dict = NTi::Dict(ToTypeV3(key, true), value);
  1212. if (fieldOptions.MapMode == EProtobufMapMode::OptionalDict) {
  1213. return NTi::Optional(dict);
  1214. } else {
  1215. return dict;
  1216. }
  1217. }
  1218. }
  1219. }
  1220. TTypePtrOrOtherColumns TTableSchemaInferrer::GetFieldType(
  1221. const FieldDescriptor& fieldDescriptor,
  1222. const TProtobufFieldOptions& defaultOptions)
  1223. {
  1224. auto fieldOptions = GetFieldOptions(&fieldDescriptor, defaultOptions);
  1225. if (fieldOptions.Type) {
  1226. ValidateProtobufType(fieldDescriptor, *fieldOptions.Type);
  1227. }
  1228. auto getScalarType = [&] {
  1229. auto valueTypeOrOtherColumns = GetScalarFieldType(fieldDescriptor, fieldOptions);
  1230. return std::visit(TOverloaded{
  1231. [] (TOtherColumns) -> TTypePtrOrOtherColumns {
  1232. return TOtherColumns{};
  1233. },
  1234. [] (EValueType valueType) -> TTypePtrOrOtherColumns {
  1235. return ToTypeV3(valueType, true);
  1236. }
  1237. }, valueTypeOrOtherColumns);
  1238. };
  1239. auto withFieldLabel = [&] (const TTypePtrOrOtherColumns& typeOrOtherColumns) -> TTypePtrOrOtherColumns {
  1240. switch (fieldDescriptor.label()) {
  1241. case FieldDescriptor::Label::LABEL_REPEATED: {
  1242. Y_ENSURE(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt,
  1243. "Repeated fields are supported only for YT serialization mode, field \"" + fieldDescriptor.full_name() +
  1244. "\" has incorrect serialization mode");
  1245. auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
  1246. Y_ENSURE(type, "OTHER_COLUMNS field can not be repeated");
  1247. switch (fieldOptions.ListMode) {
  1248. case EProtobufListMode::Required:
  1249. return NTi::TTypePtr(NTi::List(*type));
  1250. case EProtobufListMode::Optional:
  1251. return NTi::TTypePtr(NTi::Optional(NTi::List(*type)));
  1252. }
  1253. Y_ABORT();
  1254. }
  1255. case FieldDescriptor::Label::LABEL_OPTIONAL:
  1256. return std::visit(TOverloaded{
  1257. [] (TOtherColumns) -> TTypePtrOrOtherColumns {
  1258. return TOtherColumns{};
  1259. },
  1260. [] (NTi::TTypePtr type) -> TTypePtrOrOtherColumns {
  1261. return NTi::TTypePtr(NTi::Optional(std::move(type)));
  1262. }
  1263. }, typeOrOtherColumns);
  1264. case FieldDescriptor::LABEL_REQUIRED: {
  1265. auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
  1266. Y_ENSURE(type, "OTHER_COLUMNS field can not be required");
  1267. return *type;
  1268. }
  1269. }
  1270. Y_ABORT();
  1271. };
  1272. switch (fieldOptions.SerializationMode) {
  1273. case EProtobufSerializationMode::Protobuf:
  1274. return withFieldLabel(getScalarType());
  1275. case EProtobufSerializationMode::Yt:
  1276. if (fieldDescriptor.type() == FieldDescriptor::TYPE_MESSAGE) {
  1277. if (fieldDescriptor.is_map()) {
  1278. return GetMapType(fieldDescriptor, fieldOptions);
  1279. } else {
  1280. return withFieldLabel(GetMessageType(fieldDescriptor, TProtobufFieldOptions{}));
  1281. }
  1282. } else {
  1283. return withFieldLabel(getScalarType());
  1284. }
  1285. case EProtobufSerializationMode::Embedded:
  1286. ythrow yexception() << "EMBEDDED field is not allowed for field "
  1287. << fieldDescriptor.full_name();
  1288. }
  1289. Y_ABORT();
  1290. }
  1291. TTableSchema TTableSchemaInferrer::InferSchema(const Descriptor& messageDescriptor)
  1292. {
  1293. TTableSchema result;
  1294. auto defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor);
  1295. auto members = GetMessageMembers(
  1296. messageDescriptor.full_name(),
  1297. messageDescriptor,
  1298. defaultFieldOptions,
  1299. // Use special sort order for top level messages.
  1300. /*overrideFieldSortOrder*/ EProtobufFieldSortOrder::AsInProtoFile);
  1301. for (auto& member : members) {
  1302. std::visit(TOverloaded{
  1303. [&] (TOtherColumns) {
  1304. result.Strict(false);
  1305. },
  1306. [&] (NTi::TTypePtr& type) {
  1307. result.AddColumn(TColumnSchema()
  1308. .Name(std::move(member.Name))
  1309. .Type(std::move(type))
  1310. );
  1311. },
  1312. }, member.TypeOrOtherColumns);
  1313. }
  1314. return result;
  1315. }
  1316. TTableSchema CreateTableSchemaImpl(
  1317. const Descriptor& messageDescriptor,
  1318. bool keepFieldsWithoutExtension)
  1319. {
  1320. TTableSchemaInferrer inferrer(keepFieldsWithoutExtension);
  1321. return inferrer.InferSchema(messageDescriptor);
  1322. }
  1323. ////////////////////////////////////////////////////////////////////////////////
  1324. } // namespace NYT::NDetail
  1325. ////////////////////////////////////////////////////////////////////////////////
  1326. template <>
  1327. void Out<NYT::EWrapperFieldFlag::Enum>(IOutputStream& stream, NYT::EWrapperFieldFlag::Enum value)
  1328. {
  1329. stream << NYT::EWrapperFieldFlag_Enum_Name(value);
  1330. }
  1331. template <>
  1332. void Out<NYT::EWrapperMessageFlag::Enum>(IOutputStream& stream, NYT::EWrapperMessageFlag::Enum value)
  1333. {
  1334. stream << NYT::EWrapperMessageFlag_Enum_Name(value);
  1335. }
  1336. template <>
  1337. void Out<NYT::EWrapperOneofFlag::Enum>(IOutputStream& stream, NYT::EWrapperOneofFlag::Enum value)
  1338. {
  1339. stream << NYT::EWrapperOneofFlag_Enum_Name(value);
  1340. }