protobuf_format.cpp 50 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498
  1. #include "protobuf_format.h"
  2. #include "errors.h"
  3. #include <yt/yt_proto/yt/formats/extension.pb.h>
  4. #include <google/protobuf/text_format.h>
  5. #include <library/cpp/yson/node/node_io.h>
  6. #include <util/generic/hash_set.h>
  7. #include <util/generic/stack.h>
  8. #include <util/generic/overloaded.h>
  9. #include <util/stream/output.h>
  10. #include <util/stream/file.h>
  11. namespace NYT::NDetail {
  12. using ::google::protobuf::Descriptor;
  13. using ::google::protobuf::DescriptorProto;
  14. using ::google::protobuf::EnumDescriptor;
  15. using ::google::protobuf::EnumDescriptorProto;
  16. using ::google::protobuf::FieldDescriptor;
  17. using ::google::protobuf::FieldDescriptorProto;
  18. using ::google::protobuf::OneofDescriptor;
  19. using ::google::protobuf::Message;
  20. using ::google::protobuf::FileDescriptor;
  21. using ::google::protobuf::FileDescriptorProto;
  22. using ::google::protobuf::FileDescriptorSet;
  23. using ::google::protobuf::FieldOptions;
  24. using ::google::protobuf::FileOptions;
  25. using ::google::protobuf::OneofOptions;
  26. using ::google::protobuf::MessageOptions;
  27. using ::ToString;
  28. namespace {
  29. ////////////////////////////////////////////////////////////////////////////////
  30. using TOneofOption = std::variant<
  31. EProtobufOneofMode>;
  32. using TFieldOption = std::variant<
  33. EProtobufType,
  34. EProtobufSerializationMode,
  35. EProtobufListMode,
  36. EProtobufMapMode,
  37. EProtobufEnumWritingMode>;
  38. using TMessageOption = std::variant<
  39. EProtobufFieldSortOrder>;
  40. struct TOtherColumns
  41. { };
  42. using TValueTypeOrOtherColumns = std::variant<EValueType, TOtherColumns>;
  43. ////////////////////////////////////////////////////////////////////////////////
  44. TFieldOption FieldFlagToOption(EWrapperFieldFlag::Enum flag)
  45. {
  46. using EFlag = EWrapperFieldFlag;
  47. switch (flag) {
  48. case EFlag::SERIALIZATION_PROTOBUF:
  49. return EProtobufSerializationMode::Protobuf;
  50. case EFlag::SERIALIZATION_YT:
  51. return EProtobufSerializationMode::Yt;
  52. case EFlag::ANY:
  53. return EProtobufType::Any;
  54. case EFlag::OTHER_COLUMNS:
  55. return EProtobufType::OtherColumns;
  56. case EFlag::ENUM_INT:
  57. return EProtobufType::EnumInt;
  58. case EFlag::ENUM_STRING:
  59. return EProtobufType::EnumString;
  60. case EFlag::OPTIONAL_LIST:
  61. return EProtobufListMode::Optional;
  62. case EFlag::REQUIRED_LIST:
  63. return EProtobufListMode::Required;
  64. case EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY:
  65. return EProtobufMapMode::ListOfStructsLegacy;
  66. case EFlag::MAP_AS_LIST_OF_STRUCTS:
  67. return EProtobufMapMode::ListOfStructs;
  68. case EFlag::MAP_AS_DICT:
  69. return EProtobufMapMode::Dict;
  70. case EFlag::MAP_AS_OPTIONAL_DICT:
  71. return EProtobufMapMode::OptionalDict;
  72. case EFlag::EMBEDDED:
  73. return EProtobufSerializationMode::Embedded;
  74. case EFlag::ENUM_SKIP_UNKNOWN_VALUES:
  75. return EProtobufEnumWritingMode::SkipUnknownValues;
  76. case EFlag::ENUM_CHECK_VALUES:
  77. return EProtobufEnumWritingMode::CheckValues;
  78. }
  79. Y_ABORT();
  80. }
  81. TMessageOption MessageFlagToOption(EWrapperMessageFlag::Enum flag)
  82. {
  83. using EFlag = EWrapperMessageFlag;
  84. switch (flag) {
  85. case EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE:
  86. return EProtobufFieldSortOrder::AsInProtoFile;
  87. case EFlag::SORT_FIELDS_BY_FIELD_NUMBER:
  88. return EProtobufFieldSortOrder::ByFieldNumber;
  89. }
  90. Y_ABORT();
  91. }
  92. TOneofOption OneofFlagToOption(EWrapperOneofFlag::Enum flag)
  93. {
  94. using EFlag = EWrapperOneofFlag;
  95. switch (flag) {
  96. case EFlag::SEPARATE_FIELDS:
  97. return EProtobufOneofMode::SeparateFields;
  98. case EFlag::VARIANT:
  99. return EProtobufOneofMode::Variant;
  100. }
  101. Y_ABORT();
  102. }
  103. EWrapperFieldFlag::Enum OptionToFieldFlag(TFieldOption option)
  104. {
  105. using EFlag = EWrapperFieldFlag;
  106. struct TVisitor
  107. {
  108. EFlag::Enum operator() (EProtobufType type)
  109. {
  110. switch (type) {
  111. case EProtobufType::Any:
  112. return EFlag::ANY;
  113. case EProtobufType::OtherColumns:
  114. return EFlag::OTHER_COLUMNS;
  115. case EProtobufType::EnumInt:
  116. return EFlag::ENUM_INT;
  117. case EProtobufType::EnumString:
  118. return EFlag::ENUM_STRING;
  119. }
  120. Y_ABORT();
  121. }
  122. EFlag::Enum operator() (EProtobufSerializationMode serializationMode)
  123. {
  124. switch (serializationMode) {
  125. case EProtobufSerializationMode::Yt:
  126. return EFlag::SERIALIZATION_YT;
  127. case EProtobufSerializationMode::Protobuf:
  128. return EFlag::SERIALIZATION_PROTOBUF;
  129. case EProtobufSerializationMode::Embedded:
  130. return EFlag::EMBEDDED;
  131. }
  132. Y_ABORT();
  133. }
  134. EFlag::Enum operator() (EProtobufListMode listMode)
  135. {
  136. switch (listMode) {
  137. case EProtobufListMode::Optional:
  138. return EFlag::OPTIONAL_LIST;
  139. case EProtobufListMode::Required:
  140. return EFlag::REQUIRED_LIST;
  141. }
  142. Y_ABORT();
  143. }
  144. EFlag::Enum operator() (EProtobufMapMode mapMode)
  145. {
  146. switch (mapMode) {
  147. case EProtobufMapMode::ListOfStructsLegacy:
  148. return EFlag::MAP_AS_LIST_OF_STRUCTS_LEGACY;
  149. case EProtobufMapMode::ListOfStructs:
  150. return EFlag::MAP_AS_LIST_OF_STRUCTS;
  151. case EProtobufMapMode::Dict:
  152. return EFlag::MAP_AS_DICT;
  153. case EProtobufMapMode::OptionalDict:
  154. return EFlag::MAP_AS_OPTIONAL_DICT;
  155. }
  156. Y_ABORT();
  157. }
  158. EFlag::Enum operator() (EProtobufEnumWritingMode enumWritingMode)
  159. {
  160. switch (enumWritingMode) {
  161. case EProtobufEnumWritingMode::SkipUnknownValues:
  162. return EFlag::ENUM_SKIP_UNKNOWN_VALUES;
  163. case EProtobufEnumWritingMode::CheckValues:
  164. return EFlag::ENUM_CHECK_VALUES;
  165. }
  166. Y_ABORT();
  167. }
  168. };
  169. return std::visit(TVisitor(), option);
  170. }
  171. EWrapperMessageFlag::Enum OptionToMessageFlag(TMessageOption option)
  172. {
  173. using EFlag = EWrapperMessageFlag;
  174. struct TVisitor
  175. {
  176. EFlag::Enum operator() (EProtobufFieldSortOrder sortOrder)
  177. {
  178. switch (sortOrder) {
  179. case EProtobufFieldSortOrder::AsInProtoFile:
  180. return EFlag::DEPRECATED_SORT_FIELDS_AS_IN_PROTO_FILE;
  181. case EProtobufFieldSortOrder::ByFieldNumber:
  182. return EFlag::SORT_FIELDS_BY_FIELD_NUMBER;
  183. }
  184. Y_ABORT();
  185. }
  186. };
  187. return std::visit(TVisitor(), option);
  188. }
  189. EWrapperOneofFlag::Enum OptionToOneofFlag(TOneofOption option)
  190. {
  191. using EFlag = EWrapperOneofFlag;
  192. struct TVisitor
  193. {
  194. EFlag::Enum operator() (EProtobufOneofMode mode)
  195. {
  196. switch (mode) {
  197. case EProtobufOneofMode::SeparateFields:
  198. return EFlag::SEPARATE_FIELDS;
  199. case EProtobufOneofMode::Variant:
  200. return EFlag::VARIANT;
  201. }
  202. Y_ABORT();
  203. }
  204. };
  205. return std::visit(TVisitor(), option);
  206. }
  207. template <typename T, typename TOptionToFlag>
  208. void SetOption(TMaybe<T>& option, T newOption, TOptionToFlag optionToFlag)
  209. {
  210. if (option) {
  211. if (*option == newOption) {
  212. ythrow yexception() << "Duplicate protobuf flag " << optionToFlag(newOption);
  213. } else {
  214. ythrow yexception() << "Incompatible protobuf flags " <<
  215. optionToFlag(*option) << " and " << optionToFlag(newOption);
  216. }
  217. }
  218. option = newOption;
  219. }
  220. class TParseProtobufFieldOptionsVisitor
  221. {
  222. public:
  223. void operator() (EProtobufType type)
  224. {
  225. SetOption(Type, type);
  226. }
  227. void operator() (EProtobufSerializationMode serializationMode)
  228. {
  229. SetOption(SerializationMode, serializationMode);
  230. }
  231. void operator() (EProtobufListMode listMode)
  232. {
  233. SetOption(ListMode, listMode);
  234. }
  235. void operator() (EProtobufMapMode mapMode)
  236. {
  237. SetOption(MapMode, mapMode);
  238. }
  239. void operator() (EProtobufEnumWritingMode enumWritingMode)
  240. {
  241. SetOption(EnumWritingMode, enumWritingMode);
  242. }
  243. template <typename T>
  244. void SetOption(TMaybe<T>& option, T newOption)
  245. {
  246. NYT::NDetail::SetOption(option, newOption, OptionToFieldFlag);
  247. }
  248. public:
  249. TMaybe<EProtobufType> Type;
  250. TMaybe<EProtobufSerializationMode> SerializationMode;
  251. TMaybe<EProtobufListMode> ListMode;
  252. TMaybe<EProtobufMapMode> MapMode;
  253. TMaybe<EProtobufEnumWritingMode> EnumWritingMode;
  254. };
  255. class TParseProtobufMessageOptionsVisitor
  256. {
  257. public:
  258. void operator() (EProtobufFieldSortOrder fieldSortOrder)
  259. {
  260. SetOption(FieldSortOrder, fieldSortOrder);
  261. }
  262. template <typename T>
  263. void SetOption(TMaybe<T>& option, T newOption)
  264. {
  265. NYT::NDetail::SetOption(option, newOption, OptionToMessageFlag);
  266. }
  267. public:
  268. TMaybe<EProtobufFieldSortOrder> FieldSortOrder;
  269. };
  270. class TParseProtobufOneofOptionsVisitor
  271. {
  272. public:
  273. void operator() (EProtobufOneofMode mode)
  274. {
  275. SetOption(Mode, mode);
  276. }
  277. template <typename T>
  278. void SetOption(TMaybe<T>& option, T newOption)
  279. {
  280. NYT::NDetail::SetOption(option, newOption, OptionToOneofFlag);
  281. }
  282. public:
  283. TMaybe<EProtobufOneofMode> Mode;
  284. };
  285. void ParseProtobufFieldOptions(
  286. const ::google::protobuf::RepeatedField<EWrapperFieldFlag::Enum>& flags,
  287. TProtobufFieldOptions* fieldOptions)
  288. {
  289. TParseProtobufFieldOptionsVisitor visitor;
  290. for (auto flag : flags) {
  291. std::visit(visitor, FieldFlagToOption(flag));
  292. }
  293. if (visitor.Type) {
  294. fieldOptions->Type = *visitor.Type;
  295. }
  296. if (visitor.SerializationMode) {
  297. fieldOptions->SerializationMode = *visitor.SerializationMode;
  298. }
  299. if (visitor.ListMode) {
  300. fieldOptions->ListMode = *visitor.ListMode;
  301. }
  302. if (visitor.MapMode) {
  303. fieldOptions->MapMode = *visitor.MapMode;
  304. }
  305. }
  306. void ParseProtobufMessageOptions(
  307. const ::google::protobuf::RepeatedField<EWrapperMessageFlag::Enum>& flags,
  308. TProtobufMessageOptions* messageOptions)
  309. {
  310. TParseProtobufMessageOptionsVisitor visitor;
  311. for (auto flag : flags) {
  312. std::visit(visitor, MessageFlagToOption(flag));
  313. }
  314. if (visitor.FieldSortOrder) {
  315. messageOptions->FieldSortOrder = *visitor.FieldSortOrder;
  316. }
  317. }
  318. void ParseProtobufOneofOptions(
  319. const ::google::protobuf::RepeatedField<EWrapperOneofFlag::Enum>& flags,
  320. TProtobufOneofOptions* messageOptions)
  321. {
  322. TParseProtobufOneofOptionsVisitor visitor;
  323. for (auto flag : flags) {
  324. std::visit(visitor, OneofFlagToOption(flag));
  325. }
  326. if (visitor.Mode) {
  327. messageOptions->Mode = *visitor.Mode;
  328. }
  329. }
  330. TProtobufFieldOptions GetDefaultFieldOptions(
  331. const Descriptor* descriptor,
  332. TProtobufFieldOptions defaultFieldOptions = {})
  333. {
  334. ParseProtobufFieldOptions(
  335. descriptor->file()->options().GetRepeatedExtension(file_default_field_flags),
  336. &defaultFieldOptions);
  337. ParseProtobufFieldOptions(
  338. descriptor->options().GetRepeatedExtension(default_field_flags),
  339. &defaultFieldOptions);
  340. return defaultFieldOptions;
  341. }
  342. TProtobufOneofOptions GetDefaultOneofOptions(const Descriptor* descriptor)
  343. {
  344. TProtobufOneofOptions defaultOneofOptions;
  345. ParseProtobufOneofOptions(
  346. descriptor->file()->options().GetRepeatedExtension(file_default_oneof_flags),
  347. &defaultOneofOptions);
  348. ParseProtobufOneofOptions(
  349. descriptor->options().GetRepeatedExtension(default_oneof_flags),
  350. &defaultOneofOptions);
  351. switch (defaultOneofOptions.Mode) {
  352. case EProtobufOneofMode::Variant: {
  353. auto defaultFieldOptions = GetDefaultFieldOptions(descriptor);
  354. switch (defaultFieldOptions.SerializationMode) {
  355. case EProtobufSerializationMode::Protobuf:
  356. // For Protobuf serialization mode default is SeparateFields.
  357. defaultOneofOptions.Mode = EProtobufOneofMode::SeparateFields;
  358. return defaultOneofOptions;
  359. case EProtobufSerializationMode::Yt:
  360. case EProtobufSerializationMode::Embedded:
  361. return defaultOneofOptions;
  362. }
  363. Y_ABORT();
  364. }
  365. case EProtobufOneofMode::SeparateFields:
  366. return defaultOneofOptions;
  367. }
  368. Y_ABORT();
  369. }
  370. ////////////////////////////////////////////////////////////////////////////////
  371. void ValidateProtobufType(const FieldDescriptor& fieldDescriptor, EProtobufType protobufType)
  372. {
  373. const auto fieldType = fieldDescriptor.type();
  374. auto ensureType = [&] (FieldDescriptor::Type expectedType) {
  375. Y_ENSURE(fieldType == expectedType,
  376. "Type of field " << fieldDescriptor.name() << "does not match specified field flag " <<
  377. OptionToFieldFlag(protobufType) << ": "
  378. "expected " << FieldDescriptor::TypeName(expectedType) << ", " <<
  379. "got " << FieldDescriptor::TypeName(fieldType));
  380. };
  381. switch (protobufType) {
  382. case EProtobufType::Any:
  383. ensureType(FieldDescriptor::TYPE_BYTES);
  384. return;
  385. case EProtobufType::OtherColumns:
  386. ensureType(FieldDescriptor::TYPE_BYTES);
  387. return;
  388. case EProtobufType::EnumInt:
  389. ensureType(FieldDescriptor::TYPE_ENUM);
  390. return;
  391. case EProtobufType::EnumString:
  392. ensureType(FieldDescriptor::TYPE_ENUM);
  393. return;
  394. }
  395. Y_ABORT();
  396. }
  397. ////////////////////////////////////////////////////////////////////////////////
  398. class TCycleChecker
  399. {
  400. private:
  401. class TGuard
  402. {
  403. public:
  404. TGuard(TCycleChecker* checker, const Descriptor* descriptor)
  405. : Checker_(checker)
  406. , Descriptor_(descriptor)
  407. {
  408. Checker_->ActiveVertices_.insert(Descriptor_);
  409. Checker_->Stack_.push(Descriptor_);
  410. }
  411. ~TGuard()
  412. {
  413. Checker_->ActiveVertices_.erase(Descriptor_);
  414. Checker_->Stack_.pop();
  415. }
  416. private:
  417. TCycleChecker* Checker_;
  418. const Descriptor* Descriptor_;
  419. };
  420. public:
  421. [[nodiscard]] TGuard Enter(const Descriptor* descriptor)
  422. {
  423. if (ActiveVertices_.contains(descriptor)) {
  424. Y_ABORT_UNLESS(!Stack_.empty());
  425. ythrow TApiUsageError() << "Cyclic reference found for protobuf messages. " <<
  426. "Consider removing " << EWrapperFieldFlag::SERIALIZATION_YT << " flag " <<
  427. "somewhere on the cycle containing " <<
  428. Stack_.top()->full_name() << " and " << descriptor->full_name();
  429. }
  430. return TGuard(this, descriptor);
  431. }
  432. private:
  433. THashSet<const Descriptor*> ActiveVertices_;
  434. TStack<const Descriptor*> Stack_;
  435. };
  436. ////////////////////////////////////////////////////////////////////////////////
  437. } // namespace
  438. ////////////////////////////////////////////////////////////////////////////////
  439. TProtobufFieldOptions GetFieldOptions(
  440. const FieldDescriptor* fieldDescriptor,
  441. const TMaybe<TProtobufFieldOptions>& defaultFieldOptions)
  442. {
  443. TProtobufFieldOptions options;
  444. if (defaultFieldOptions) {
  445. options = *defaultFieldOptions;
  446. } else {
  447. options = GetDefaultFieldOptions(fieldDescriptor->containing_type());
  448. }
  449. ParseProtobufFieldOptions(fieldDescriptor->options().GetRepeatedExtension(flags), &options);
  450. return options;
  451. }
  452. TProtobufOneofOptions GetOneofOptions(
  453. const OneofDescriptor* oneofDescriptor,
  454. const TMaybe<TProtobufOneofOptions>& defaultOneofOptions)
  455. {
  456. TProtobufOneofOptions options;
  457. if (defaultOneofOptions) {
  458. options = *defaultOneofOptions;
  459. } else {
  460. options = GetDefaultOneofOptions(oneofDescriptor->containing_type());
  461. }
  462. ParseProtobufOneofOptions(oneofDescriptor->options().GetRepeatedExtension(oneof_flags), &options);
  463. if (oneofDescriptor->is_synthetic()) {
  464. options.Mode = EProtobufOneofMode::SeparateFields;
  465. }
  466. auto variantFieldName = oneofDescriptor->options().GetExtension(variant_field_name);
  467. switch (options.Mode) {
  468. case EProtobufOneofMode::SeparateFields:
  469. if (variantFieldName) {
  470. ythrow TApiUsageError() << "\"variant_field_name\" requires (NYT.oneof_flags) = VARIANT";
  471. }
  472. break;
  473. case EProtobufOneofMode::Variant:
  474. if (variantFieldName) {
  475. options.VariantFieldName = variantFieldName;
  476. } else {
  477. options.VariantFieldName = oneofDescriptor->name();
  478. }
  479. break;
  480. }
  481. return options;
  482. }
  483. TProtobufMessageOptions GetMessageOptions(const Descriptor* descriptor)
  484. {
  485. TProtobufMessageOptions options;
  486. ParseProtobufMessageOptions(
  487. descriptor->file()->options().GetRepeatedExtension(file_default_message_flags),
  488. &options);
  489. ParseProtobufMessageOptions(
  490. descriptor->options().GetRepeatedExtension(message_flags),
  491. &options);
  492. return options;
  493. }
  494. TNode MakeEnumerationConfig(const ::google::protobuf::EnumDescriptor* enumDescriptor)
  495. {
  496. auto config = TNode::CreateMap();
  497. for (int i = 0; i < enumDescriptor->value_count(); ++i) {
  498. config[enumDescriptor->value(i)->name()] = enumDescriptor->value(i)->number();
  499. }
  500. return config;
  501. }
  502. TString DeduceProtobufType(
  503. const FieldDescriptor* fieldDescriptor,
  504. const TProtobufFieldOptions& options)
  505. {
  506. if (options.Type) {
  507. ValidateProtobufType(*fieldDescriptor, *options.Type);
  508. return ToString(*options.Type);
  509. }
  510. switch (fieldDescriptor->type()) {
  511. case FieldDescriptor::TYPE_ENUM:
  512. return ToString(EProtobufType::EnumString);
  513. case FieldDescriptor::TYPE_MESSAGE:
  514. switch (options.SerializationMode) {
  515. case EProtobufSerializationMode::Protobuf:
  516. return "message";
  517. case EProtobufSerializationMode::Yt:
  518. return "structured_message";
  519. case EProtobufSerializationMode::Embedded:
  520. return "embedded_message";
  521. }
  522. Y_ABORT();
  523. default:
  524. return fieldDescriptor->type_name();
  525. }
  526. Y_ABORT();
  527. }
  528. TString GetColumnName(const ::google::protobuf::FieldDescriptor& field)
  529. {
  530. const auto& options = field.options();
  531. const auto columnName = options.GetExtension(column_name);
  532. if (!columnName.empty()) {
  533. return columnName;
  534. }
  535. const auto keyColumnName = options.GetExtension(key_column_name);
  536. if (!keyColumnName.empty()) {
  537. return keyColumnName;
  538. }
  539. return field.name();
  540. }
  541. TNode MakeProtoFormatMessageFieldsConfig(
  542. const Descriptor* descriptor,
  543. TNode* enumerations,
  544. TCycleChecker& cycleChecker);
  545. TNode MakeProtoFormatMessageFieldsConfig(
  546. const Descriptor* descriptor,
  547. TNode* enumerations,
  548. const TProtobufFieldOptions& defaultFieldOptions,
  549. const TProtobufOneofOptions& defaultOneofOptions,
  550. TCycleChecker& cycleChecker);
  551. TNode MakeMapFieldsConfig(
  552. const FieldDescriptor* fieldDescriptor,
  553. TNode* enumerations,
  554. const TProtobufFieldOptions& fieldOptions,
  555. TCycleChecker& cycleChecker)
  556. {
  557. Y_ABORT_UNLESS(fieldDescriptor->is_map());
  558. auto message = fieldDescriptor->message_type();
  559. switch (fieldOptions.MapMode) {
  560. case EProtobufMapMode::ListOfStructsLegacy:
  561. return MakeProtoFormatMessageFieldsConfig(
  562. message,
  563. enumerations,
  564. cycleChecker);
  565. case EProtobufMapMode::ListOfStructs:
  566. case EProtobufMapMode::Dict:
  567. case EProtobufMapMode::OptionalDict: {
  568. TProtobufFieldOptions defaultFieldOptions;
  569. defaultFieldOptions.SerializationMode = EProtobufSerializationMode::Yt;
  570. return MakeProtoFormatMessageFieldsConfig(
  571. message,
  572. enumerations,
  573. defaultFieldOptions,
  574. TProtobufOneofOptions{},
  575. cycleChecker);
  576. }
  577. }
  578. Y_ABORT();
  579. }
  580. TNode MakeProtoFormatFieldConfig(
  581. const FieldDescriptor* fieldDescriptor,
  582. TNode* enumerations,
  583. const TProtobufFieldOptions& defaultOptions,
  584. TCycleChecker& cycleChecker)
  585. {
  586. auto fieldConfig = TNode::CreateMap();
  587. fieldConfig["field_number"] = fieldDescriptor->number();
  588. fieldConfig["name"] = GetColumnName(*fieldDescriptor);
  589. auto fieldOptions = GetFieldOptions(fieldDescriptor, defaultOptions);
  590. Y_ENSURE(fieldOptions.SerializationMode != EProtobufSerializationMode::Embedded,
  591. "EMBEDDED flag is currently supported only with "
  592. "ProtobufFormatWithDescriptors config option set to true");
  593. if (fieldDescriptor->is_repeated()) {
  594. Y_ENSURE_EX(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt,
  595. TApiUsageError() << "Repeated field \"" << fieldDescriptor->full_name() << "\" " <<
  596. "must have flag \"" << EWrapperFieldFlag::SERIALIZATION_YT << "\"");
  597. }
  598. fieldConfig["repeated"] = fieldDescriptor->is_repeated();
  599. fieldConfig["packed"] = fieldDescriptor->is_packed();
  600. fieldConfig["proto_type"] = DeduceProtobufType(fieldDescriptor, fieldOptions);
  601. if (fieldDescriptor->type() == FieldDescriptor::TYPE_ENUM) {
  602. auto* enumeration = fieldDescriptor->enum_type();
  603. (*enumerations)[enumeration->full_name()] = MakeEnumerationConfig(enumeration);
  604. fieldConfig["enumeration_name"] = enumeration->full_name();
  605. }
  606. if (fieldOptions.SerializationMode != EProtobufSerializationMode::Yt) {
  607. return fieldConfig;
  608. }
  609. if (fieldDescriptor->is_map()) {
  610. fieldConfig["fields"] = MakeMapFieldsConfig(fieldDescriptor, enumerations, fieldOptions, cycleChecker);
  611. return fieldConfig;
  612. }
  613. if (fieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE) {
  614. fieldConfig["fields"] = MakeProtoFormatMessageFieldsConfig(
  615. fieldDescriptor->message_type(),
  616. enumerations,
  617. cycleChecker);
  618. }
  619. return fieldConfig;
  620. }
  621. void MakeProtoFormatOneofConfig(
  622. const OneofDescriptor* oneofDescriptor,
  623. TNode* enumerations,
  624. const TProtobufFieldOptions& defaultFieldOptions,
  625. const TProtobufOneofOptions& defaultOneofOptions,
  626. TCycleChecker& cycleChecker,
  627. TNode* fields)
  628. {
  629. auto addFields = [&] (TNode* fields) {
  630. for (int i = 0; i < oneofDescriptor->field_count(); ++i) {
  631. fields->Add(MakeProtoFormatFieldConfig(
  632. oneofDescriptor->field(i),
  633. enumerations,
  634. defaultFieldOptions,
  635. cycleChecker));
  636. }
  637. };
  638. auto oneofOptions = GetOneofOptions(oneofDescriptor, defaultOneofOptions);
  639. switch (oneofOptions.Mode) {
  640. case EProtobufOneofMode::SeparateFields:
  641. addFields(fields);
  642. return;
  643. case EProtobufOneofMode::Variant: {
  644. auto oneofFields = TNode::CreateList();
  645. addFields(&oneofFields);
  646. auto oneofField = TNode()
  647. ("proto_type", "oneof")
  648. ("name", oneofOptions.VariantFieldName)
  649. ("fields", std::move(oneofFields));
  650. fields->Add(std::move(oneofField));
  651. return;
  652. }
  653. }
  654. Y_ABORT();
  655. }
  656. TNode MakeProtoFormatMessageFieldsConfig(
  657. const Descriptor* descriptor,
  658. TNode* enumerations,
  659. const TProtobufFieldOptions& defaultFieldOptions,
  660. const TProtobufOneofOptions& defaultOneofOptions,
  661. TCycleChecker& cycleChecker)
  662. {
  663. auto fields = TNode::CreateList();
  664. THashSet<const OneofDescriptor*> visitedOneofs;
  665. auto guard = cycleChecker.Enter(descriptor);
  666. for (int fieldIndex = 0; fieldIndex < descriptor->field_count(); ++fieldIndex) {
  667. auto fieldDescriptor = descriptor->field(fieldIndex);
  668. auto oneofDescriptor = fieldDescriptor->containing_oneof();
  669. if (!oneofDescriptor) {
  670. fields.Add(MakeProtoFormatFieldConfig(
  671. fieldDescriptor,
  672. enumerations,
  673. defaultFieldOptions,
  674. cycleChecker));
  675. } else if (!visitedOneofs.contains(oneofDescriptor)) {
  676. MakeProtoFormatOneofConfig(
  677. oneofDescriptor,
  678. enumerations,
  679. defaultFieldOptions,
  680. defaultOneofOptions,
  681. cycleChecker,
  682. &fields);
  683. visitedOneofs.insert(oneofDescriptor);
  684. }
  685. }
  686. return fields;
  687. }
  688. TNode MakeProtoFormatMessageFieldsConfig(
  689. const Descriptor* descriptor,
  690. TNode* enumerations,
  691. TCycleChecker& cycleChecker)
  692. {
  693. return MakeProtoFormatMessageFieldsConfig(
  694. descriptor,
  695. enumerations,
  696. GetDefaultFieldOptions(descriptor),
  697. GetDefaultOneofOptions(descriptor),
  698. cycleChecker);
  699. }
  700. TNode MakeProtoFormatConfigWithTables(const TVector<const Descriptor*>& descriptors)
  701. {
  702. TNode config("protobuf");
  703. config.Attributes()
  704. ("enumerations", TNode::CreateMap())
  705. ("tables", TNode::CreateList());
  706. auto& enumerations = config.Attributes()["enumerations"];
  707. for (auto* descriptor : descriptors) {
  708. TCycleChecker cycleChecker;
  709. auto columns = MakeProtoFormatMessageFieldsConfig(descriptor, &enumerations, cycleChecker);
  710. config.Attributes()["tables"].Add(
  711. TNode()("columns", std::move(columns)));
  712. }
  713. return config;
  714. }
  715. ////////////////////////////////////////////////////////////////////////////////
  716. class TFileDescriptorSetBuilder
  717. {
  718. public:
  719. TFileDescriptorSetBuilder()
  720. : ExtensionFile_(EWrapperFieldFlag::descriptor()->file())
  721. { }
  722. void AddDescriptor(const Descriptor* descriptor)
  723. {
  724. auto [it, inserted] = AllDescriptors_.insert(descriptor);
  725. if (!inserted) {
  726. return;
  727. }
  728. const auto* containingType = descriptor->containing_type();
  729. while (containingType) {
  730. AddDescriptor(containingType);
  731. containingType = containingType->containing_type();
  732. }
  733. for (int i = 0; i < descriptor->field_count(); ++i) {
  734. AddField(descriptor->field(i));
  735. }
  736. }
  737. FileDescriptorSet Build()
  738. {
  739. THashSet<const FileDescriptor*> visitedFiles;
  740. TVector<const FileDescriptor*> fileTopoOrder;
  741. for (const auto* descriptor : AllDescriptors_) {
  742. TraverseDependencies(descriptor->file(), visitedFiles, fileTopoOrder);
  743. }
  744. THashSet<TString> messageTypeNames;
  745. THashSet<TString> enumTypeNames;
  746. for (const auto* descriptor : AllDescriptors_) {
  747. messageTypeNames.insert(descriptor->full_name());
  748. }
  749. for (const auto* enumDescriptor : EnumDescriptors_) {
  750. enumTypeNames.insert(enumDescriptor->full_name());
  751. }
  752. FileDescriptorSet fileDescriptorSetProto;
  753. for (const auto* file : fileTopoOrder) {
  754. auto* fileProto = fileDescriptorSetProto.add_file();
  755. file->CopyTo(fileProto);
  756. Strip(fileProto, messageTypeNames, enumTypeNames);
  757. }
  758. return fileDescriptorSetProto;
  759. }
  760. private:
  761. void AddField(const FieldDescriptor* fieldDescriptor)
  762. {
  763. if (fieldDescriptor->message_type()) {
  764. AddDescriptor(fieldDescriptor->message_type());
  765. }
  766. if (fieldDescriptor->enum_type()) {
  767. AddEnumDescriptor(fieldDescriptor->enum_type());
  768. }
  769. }
  770. void AddEnumDescriptor(const EnumDescriptor* enumDescriptor)
  771. {
  772. auto [it, inserted] = EnumDescriptors_.insert(enumDescriptor);
  773. if (!inserted) {
  774. return;
  775. }
  776. const auto* containingType = enumDescriptor->containing_type();
  777. while (containingType) {
  778. AddDescriptor(containingType);
  779. containingType = containingType->containing_type();
  780. }
  781. }
  782. void TraverseDependencies(
  783. const FileDescriptor* current,
  784. THashSet<const FileDescriptor*>& visited,
  785. TVector<const FileDescriptor*>& topoOrder)
  786. {
  787. auto [it, inserted] = visited.insert(current);
  788. if (!inserted) {
  789. return;
  790. }
  791. for (int i = 0; i < current->dependency_count(); ++i) {
  792. TraverseDependencies(current->dependency(i), visited, topoOrder);
  793. }
  794. topoOrder.push_back(current);
  795. }
  796. template <typename TOptions>
  797. void StripUnknownOptions(TOptions* options)
  798. {
  799. std::vector<const FieldDescriptor*> fields;
  800. auto reflection = options->GetReflection();
  801. reflection->ListFields(*options, &fields);
  802. for (auto field : fields) {
  803. if (field->is_extension() && field->file() != ExtensionFile_) {
  804. reflection->ClearField(options, field);
  805. }
  806. }
  807. }
  808. template <typename TRepeatedField, typename TPredicate>
  809. void RemoveIf(TRepeatedField* repeatedField, TPredicate predicate)
  810. {
  811. repeatedField->erase(
  812. std::remove_if(repeatedField->begin(), repeatedField->end(), predicate),
  813. repeatedField->end());
  814. }
  815. void Strip(
  816. const TString& containingTypePrefix,
  817. DescriptorProto* messageProto,
  818. const THashSet<TString>& messageTypeNames,
  819. const THashSet<TString>& enumTypeNames)
  820. {
  821. const auto prefix = containingTypePrefix + messageProto->name() + '.';
  822. RemoveIf(messageProto->mutable_nested_type(), [&] (const DescriptorProto& descriptorProto) {
  823. return !messageTypeNames.contains(prefix + descriptorProto.name());
  824. });
  825. RemoveIf(messageProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) {
  826. return !enumTypeNames.contains(prefix + enumDescriptorProto.name());
  827. });
  828. messageProto->clear_extension();
  829. StripUnknownOptions(messageProto->mutable_options());
  830. for (auto& fieldProto : *messageProto->mutable_field()) {
  831. StripUnknownOptions(fieldProto.mutable_options());
  832. }
  833. for (auto& oneofProto : *messageProto->mutable_oneof_decl()) {
  834. StripUnknownOptions(oneofProto.mutable_options());
  835. }
  836. for (auto& nestedTypeProto : *messageProto->mutable_nested_type()) {
  837. Strip(prefix, &nestedTypeProto, messageTypeNames, enumTypeNames);
  838. }
  839. for (auto& enumProto : *messageProto->mutable_enum_type()) {
  840. StripUnknownOptions(enumProto.mutable_options());
  841. for (auto& enumValue : *enumProto.mutable_value()) {
  842. StripUnknownOptions(enumValue.mutable_options());
  843. }
  844. }
  845. }
  846. void Strip(
  847. FileDescriptorProto* fileProto,
  848. const THashSet<TString>& messageTypeNames,
  849. const THashSet<TString>& enumTypeNames)
  850. {
  851. const auto prefix = fileProto->package().Empty()
  852. ? ""
  853. : fileProto->package() + '.';
  854. RemoveIf(fileProto->mutable_message_type(), [&] (const DescriptorProto& descriptorProto) {
  855. return !messageTypeNames.contains(prefix + descriptorProto.name());
  856. });
  857. RemoveIf(fileProto->mutable_enum_type(), [&] (const EnumDescriptorProto& enumDescriptorProto) {
  858. return !enumTypeNames.contains(prefix + enumDescriptorProto.name());
  859. });
  860. fileProto->clear_service();
  861. fileProto->clear_extension();
  862. StripUnknownOptions(fileProto->mutable_options());
  863. for (auto& messageProto : *fileProto->mutable_message_type()) {
  864. Strip(prefix, &messageProto, messageTypeNames, enumTypeNames);
  865. }
  866. for (auto& enumProto : *fileProto->mutable_enum_type()) {
  867. StripUnknownOptions(enumProto.mutable_options());
  868. for (auto& enumValue : *enumProto.mutable_value()) {
  869. StripUnknownOptions(enumValue.mutable_options());
  870. }
  871. }
  872. }
  873. private:
  874. const FileDescriptor* const ExtensionFile_;
  875. THashSet<const Descriptor*> AllDescriptors_;
  876. THashSet<const EnumDescriptor*> EnumDescriptors_;
  877. };
  878. TNode MakeProtoFormatConfigWithDescriptors(const TVector<const Descriptor*>& descriptors)
  879. {
  880. TFileDescriptorSetBuilder builder;
  881. auto typeNames = TNode::CreateList();
  882. for (const auto* descriptor : descriptors) {
  883. builder.AddDescriptor(descriptor);
  884. typeNames.Add(descriptor->full_name());
  885. }
  886. auto fileDescriptorSetText = builder.Build().ShortDebugString();
  887. TNode config("protobuf");
  888. config.Attributes()
  889. ("file_descriptor_set_text", std::move(fileDescriptorSetText))
  890. ("type_names", std::move(typeNames));
  891. return config;
  892. }
  893. ////////////////////////////////////////////////////////////////////////////////
  894. using TTypePtrOrOtherColumns = std::variant<NTi::TTypePtr, TOtherColumns>;
  895. struct TMember {
  896. TString Name;
  897. TTypePtrOrOtherColumns TypeOrOtherColumns;
  898. };
  899. ////////////////////////////////////////////////////////////////////////////////
  900. TValueTypeOrOtherColumns GetScalarFieldType(
  901. const FieldDescriptor& fieldDescriptor,
  902. const TProtobufFieldOptions& options)
  903. {
  904. if (options.Type) {
  905. switch (*options.Type) {
  906. case EProtobufType::EnumInt:
  907. return EValueType::VT_INT64;
  908. case EProtobufType::EnumString:
  909. return EValueType::VT_STRING;
  910. case EProtobufType::Any:
  911. return EValueType::VT_ANY;
  912. case EProtobufType::OtherColumns:
  913. return TOtherColumns{};
  914. }
  915. Y_ABORT();
  916. }
  917. switch (fieldDescriptor.cpp_type()) {
  918. case FieldDescriptor::CPPTYPE_INT32:
  919. return EValueType::VT_INT32;
  920. case FieldDescriptor::CPPTYPE_INT64:
  921. return EValueType::VT_INT64;
  922. case FieldDescriptor::CPPTYPE_UINT32:
  923. return EValueType::VT_UINT32;
  924. case FieldDescriptor::CPPTYPE_UINT64:
  925. return EValueType::VT_UINT64;
  926. case FieldDescriptor::CPPTYPE_FLOAT:
  927. case FieldDescriptor::CPPTYPE_DOUBLE:
  928. return EValueType::VT_DOUBLE;
  929. case FieldDescriptor::CPPTYPE_BOOL:
  930. return EValueType::VT_BOOLEAN;
  931. case FieldDescriptor::CPPTYPE_STRING:
  932. case FieldDescriptor::CPPTYPE_MESSAGE:
  933. case FieldDescriptor::CPPTYPE_ENUM:
  934. return EValueType::VT_STRING;
  935. default:
  936. ythrow yexception() <<
  937. "Unexpected field type '" << fieldDescriptor.cpp_type_name() << "' " <<
  938. "for field " << fieldDescriptor.name();
  939. }
  940. }
  941. bool HasNameExtension(const FieldDescriptor& fieldDescriptor)
  942. {
  943. const auto& options = fieldDescriptor.options();
  944. return options.HasExtension(column_name) || options.HasExtension(key_column_name);
  945. }
  946. void SortFields(TVector<const FieldDescriptor*>& fieldDescriptors, EProtobufFieldSortOrder fieldSortOrder)
  947. {
  948. switch (fieldSortOrder) {
  949. case EProtobufFieldSortOrder::AsInProtoFile:
  950. return;
  951. case EProtobufFieldSortOrder::ByFieldNumber:
  952. SortBy(fieldDescriptors, [] (const FieldDescriptor* fieldDescriptor) {
  953. return fieldDescriptor->number();
  954. });
  955. return;
  956. }
  957. Y_ABORT();
  958. }
  959. NTi::TTypePtr CreateStruct(TStringBuf fieldName, TVector<TMember> members)
  960. {
  961. TVector<NTi::TStructType::TOwnedMember> structMembers;
  962. structMembers.reserve(members.size());
  963. for (auto& member : members) {
  964. std::visit(TOverloaded{
  965. [&] (TOtherColumns) {
  966. ythrow TApiUsageError() <<
  967. "Could not deduce YT type for field " << member.Name << " of " <<
  968. "embedded message field " << fieldName << " " <<
  969. "(note that " << EWrapperFieldFlag::OTHER_COLUMNS << " fields " <<
  970. "are not allowed inside embedded messages)";
  971. },
  972. [&] (NTi::TTypePtr& type) {
  973. structMembers.emplace_back(std::move(member.Name), std::move(type));
  974. },
  975. }, member.TypeOrOtherColumns);
  976. }
  977. return NTi::Struct(std::move(structMembers));
  978. }
  979. TMaybe<TVector<TString>> InferColumnFilter(const ::google::protobuf::Descriptor& descriptor)
  980. {
  981. auto isOtherColumns = [] (const ::google::protobuf::FieldDescriptor& field) {
  982. return GetFieldOptions(&field).Type == EProtobufType::OtherColumns;
  983. };
  984. TVector<TString> result;
  985. result.reserve(descriptor.field_count());
  986. for (int i = 0; i < descriptor.field_count(); ++i) {
  987. const auto& field = *descriptor.field(i);
  988. if (isOtherColumns(field)) {
  989. return {};
  990. }
  991. result.push_back(GetColumnName(field));
  992. }
  993. return result;
  994. }
  995. ////////////////////////////////////////////////////////////////////////////////
  996. class TTableSchemaInferrer
  997. {
  998. public:
  999. TTableSchemaInferrer(bool keepFieldsWithoutExtension)
  1000. : KeepFieldsWithoutExtension_(keepFieldsWithoutExtension)
  1001. { }
  1002. TTableSchema InferSchema(const Descriptor& messageDescriptor);
  1003. private:
  1004. TTypePtrOrOtherColumns GetFieldType(
  1005. const FieldDescriptor& fieldDescriptor,
  1006. const TProtobufFieldOptions& defaultOptions);
  1007. void ProcessOneofField(
  1008. TStringBuf containingFieldName,
  1009. const OneofDescriptor& oneofDescriptor,
  1010. const TProtobufFieldOptions& defaultFieldOptions,
  1011. const TProtobufOneofOptions& defaultOneofOptions,
  1012. EProtobufFieldSortOrder fieldSortOrder,
  1013. TVector<TMember>* members);
  1014. TVector<TMember> GetMessageMembers(
  1015. TStringBuf containingFieldName,
  1016. const Descriptor& fieldDescriptor,
  1017. TProtobufFieldOptions defaultFieldOptions,
  1018. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder = std::nullopt);
  1019. NTi::TTypePtr GetMessageType(
  1020. const FieldDescriptor& fieldDescriptor,
  1021. TProtobufFieldOptions defaultFieldOptions);
  1022. NTi::TTypePtr GetMapType(
  1023. const FieldDescriptor& fieldDescriptor,
  1024. const TProtobufFieldOptions& fieldOptions);
  1025. private:
  1026. void GetMessageMembersImpl(
  1027. TStringBuf containingFieldName,
  1028. const Descriptor& fieldDescriptor,
  1029. TProtobufFieldOptions defaultFieldOptions,
  1030. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder,
  1031. TVector<TMember>* members);
  1032. private:
  1033. const bool KeepFieldsWithoutExtension_;
  1034. TCycleChecker CycleChecker_;
  1035. };
  1036. void TTableSchemaInferrer::ProcessOneofField(
  1037. TStringBuf containingFieldName,
  1038. const OneofDescriptor& oneofDescriptor,
  1039. const TProtobufFieldOptions& defaultFieldOptions,
  1040. const TProtobufOneofOptions& defaultOneofOptions,
  1041. EProtobufFieldSortOrder fieldSortOrder,
  1042. TVector<TMember>* members)
  1043. {
  1044. auto oneofOptions = GetOneofOptions(&oneofDescriptor, defaultOneofOptions);
  1045. auto addFields = [&] (TVector<TMember>* members, bool removeOptionality) {
  1046. TVector<const FieldDescriptor*> fieldDescriptors;
  1047. for (int i = 0; i < oneofDescriptor.field_count(); ++i) {
  1048. fieldDescriptors.push_back(oneofDescriptor.field(i));
  1049. }
  1050. SortFields(fieldDescriptors, fieldSortOrder);
  1051. for (auto innerFieldDescriptor : fieldDescriptors) {
  1052. auto typeOrOtherColumns = GetFieldType(
  1053. *innerFieldDescriptor,
  1054. defaultFieldOptions);
  1055. if (auto* maybeType = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
  1056. maybeType && removeOptionality && (*maybeType)->IsOptional())
  1057. {
  1058. typeOrOtherColumns = (*maybeType)->AsOptional()->GetItemType();
  1059. }
  1060. members->push_back(TMember{
  1061. GetColumnName(*innerFieldDescriptor),
  1062. std::move(typeOrOtherColumns),
  1063. });
  1064. }
  1065. };
  1066. switch (oneofOptions.Mode) {
  1067. case EProtobufOneofMode::SeparateFields:
  1068. addFields(members, /* removeOptionality */ false);
  1069. return;
  1070. case EProtobufOneofMode::Variant: {
  1071. TVector<TMember> variantMembers;
  1072. addFields(&variantMembers, /* removeOptionality */ true);
  1073. members->push_back(TMember{
  1074. oneofOptions.VariantFieldName,
  1075. NTi::Optional(
  1076. NTi::Variant(
  1077. CreateStruct(containingFieldName, std::move(variantMembers))
  1078. )
  1079. )
  1080. });
  1081. return;
  1082. }
  1083. }
  1084. Y_ABORT();
  1085. }
  1086. TVector<TMember> TTableSchemaInferrer::GetMessageMembers(
  1087. TStringBuf containingFieldName,
  1088. const Descriptor& messageDescriptor,
  1089. TProtobufFieldOptions defaultFieldOptions,
  1090. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder)
  1091. {
  1092. TVector<TMember> members;
  1093. GetMessageMembersImpl(
  1094. containingFieldName,
  1095. messageDescriptor,
  1096. defaultFieldOptions,
  1097. overrideFieldSortOrder,
  1098. &members
  1099. );
  1100. return members;
  1101. }
  1102. void TTableSchemaInferrer::GetMessageMembersImpl(
  1103. TStringBuf containingFieldName,
  1104. const Descriptor& messageDescriptor,
  1105. TProtobufFieldOptions defaultFieldOptions,
  1106. std::optional<EProtobufFieldSortOrder> overrideFieldSortOrder,
  1107. TVector<TMember>* members)
  1108. {
  1109. auto guard = CycleChecker_.Enter(&messageDescriptor);
  1110. defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor, defaultFieldOptions);
  1111. auto messageOptions = GetMessageOptions(&messageDescriptor);
  1112. auto defaultOneofOptions = GetDefaultOneofOptions(&messageDescriptor);
  1113. TVector<const FieldDescriptor*> fieldDescriptors;
  1114. fieldDescriptors.reserve(messageDescriptor.field_count());
  1115. for (int i = 0; i < messageDescriptor.field_count(); ++i) {
  1116. if (!KeepFieldsWithoutExtension_ && !HasNameExtension(*messageDescriptor.field(i))) {
  1117. continue;
  1118. }
  1119. fieldDescriptors.push_back(messageDescriptor.field(i));
  1120. }
  1121. auto fieldSortOrder = overrideFieldSortOrder.value_or(messageOptions.FieldSortOrder);
  1122. SortFields(fieldDescriptors, fieldSortOrder);
  1123. THashSet<const OneofDescriptor*> visitedOneofs;
  1124. for (const auto innerFieldDescriptor : fieldDescriptors) {
  1125. auto oneofDescriptor = innerFieldDescriptor->containing_oneof();
  1126. if (oneofDescriptor) {
  1127. if (visitedOneofs.contains(oneofDescriptor)) {
  1128. continue;
  1129. }
  1130. ProcessOneofField(
  1131. containingFieldName,
  1132. *oneofDescriptor,
  1133. defaultFieldOptions,
  1134. defaultOneofOptions,
  1135. messageOptions.FieldSortOrder,
  1136. members);
  1137. visitedOneofs.insert(oneofDescriptor);
  1138. continue;
  1139. }
  1140. auto fieldOptions = GetFieldOptions(innerFieldDescriptor, defaultFieldOptions);
  1141. if (fieldOptions.SerializationMode == EProtobufSerializationMode::Embedded) {
  1142. Y_ENSURE(innerFieldDescriptor->type() == FieldDescriptor::TYPE_MESSAGE,
  1143. "EMBEDDED column must have message type");
  1144. Y_ENSURE(innerFieldDescriptor->label() == FieldDescriptor::LABEL_REQUIRED,
  1145. "EMBEDDED column must be marked required");
  1146. GetMessageMembersImpl(
  1147. innerFieldDescriptor->full_name(),
  1148. *innerFieldDescriptor->message_type(),
  1149. defaultFieldOptions,
  1150. /*overrideFieldSortOrder*/ std::nullopt,
  1151. members);
  1152. } else {
  1153. auto typeOrOtherColumns = GetFieldType(
  1154. *innerFieldDescriptor,
  1155. defaultFieldOptions);
  1156. members->push_back(TMember{
  1157. GetColumnName(*innerFieldDescriptor),
  1158. std::move(typeOrOtherColumns),
  1159. });
  1160. }
  1161. }
  1162. }
  1163. NTi::TTypePtr TTableSchemaInferrer::GetMessageType(
  1164. const FieldDescriptor& fieldDescriptor,
  1165. TProtobufFieldOptions defaultFieldOptions)
  1166. {
  1167. Y_ABORT_UNLESS(fieldDescriptor.message_type());
  1168. const auto& messageDescriptor = *fieldDescriptor.message_type();
  1169. auto members = GetMessageMembers(
  1170. fieldDescriptor.full_name(),
  1171. messageDescriptor,
  1172. defaultFieldOptions);
  1173. return CreateStruct(fieldDescriptor.full_name(), std::move(members));
  1174. }
  1175. NTi::TTypePtr TTableSchemaInferrer::GetMapType(
  1176. const FieldDescriptor& fieldDescriptor,
  1177. const TProtobufFieldOptions& fieldOptions)
  1178. {
  1179. Y_ABORT_UNLESS(fieldDescriptor.is_map());
  1180. switch (fieldOptions.MapMode) {
  1181. case EProtobufMapMode::ListOfStructsLegacy:
  1182. case EProtobufMapMode::ListOfStructs: {
  1183. TProtobufFieldOptions embeddedOptions;
  1184. if (fieldOptions.MapMode == EProtobufMapMode::ListOfStructs) {
  1185. embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt;
  1186. }
  1187. auto list = NTi::List(GetMessageType(fieldDescriptor, embeddedOptions));
  1188. switch (fieldOptions.ListMode) {
  1189. case EProtobufListMode::Required:
  1190. return list;
  1191. case EProtobufListMode::Optional:
  1192. return NTi::Optional(std::move(list));
  1193. }
  1194. Y_ABORT();
  1195. }
  1196. case EProtobufMapMode::Dict:
  1197. case EProtobufMapMode::OptionalDict: {
  1198. auto message = fieldDescriptor.message_type();
  1199. Y_ABORT_UNLESS(message->field_count() == 2);
  1200. auto keyVariant = GetScalarFieldType(*message->field(0), TProtobufFieldOptions{});
  1201. Y_ABORT_UNLESS(std::holds_alternative<EValueType>(keyVariant));
  1202. auto key = std::get<EValueType>(keyVariant);
  1203. TProtobufFieldOptions embeddedOptions;
  1204. embeddedOptions.SerializationMode = EProtobufSerializationMode::Yt;
  1205. auto valueVariant = GetFieldType(*message->field(1), embeddedOptions);
  1206. Y_ABORT_UNLESS(std::holds_alternative<NTi::TTypePtr>(valueVariant));
  1207. auto value = std::get<NTi::TTypePtr>(valueVariant);
  1208. Y_ABORT_UNLESS(value->IsOptional());
  1209. value = value->AsOptional()->GetItemType();
  1210. auto dict = NTi::Dict(ToTypeV3(key, true), value);
  1211. if (fieldOptions.MapMode == EProtobufMapMode::OptionalDict) {
  1212. return NTi::Optional(dict);
  1213. } else {
  1214. return dict;
  1215. }
  1216. }
  1217. }
  1218. }
  1219. TTypePtrOrOtherColumns TTableSchemaInferrer::GetFieldType(
  1220. const FieldDescriptor& fieldDescriptor,
  1221. const TProtobufFieldOptions& defaultOptions)
  1222. {
  1223. auto fieldOptions = GetFieldOptions(&fieldDescriptor, defaultOptions);
  1224. if (fieldOptions.Type) {
  1225. ValidateProtobufType(fieldDescriptor, *fieldOptions.Type);
  1226. }
  1227. auto getScalarType = [&] {
  1228. auto valueTypeOrOtherColumns = GetScalarFieldType(fieldDescriptor, fieldOptions);
  1229. return std::visit(TOverloaded{
  1230. [] (TOtherColumns) -> TTypePtrOrOtherColumns {
  1231. return TOtherColumns{};
  1232. },
  1233. [] (EValueType valueType) -> TTypePtrOrOtherColumns {
  1234. return ToTypeV3(valueType, true);
  1235. }
  1236. }, valueTypeOrOtherColumns);
  1237. };
  1238. auto withFieldLabel = [&] (const TTypePtrOrOtherColumns& typeOrOtherColumns) -> TTypePtrOrOtherColumns {
  1239. switch (fieldDescriptor.label()) {
  1240. case FieldDescriptor::Label::LABEL_REPEATED: {
  1241. Y_ENSURE(fieldOptions.SerializationMode == EProtobufSerializationMode::Yt,
  1242. "Repeated fields are supported only for YT serialization mode, field \"" + fieldDescriptor.full_name() +
  1243. "\" has incorrect serialization mode");
  1244. auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
  1245. Y_ENSURE(type, "OTHER_COLUMNS field can not be repeated");
  1246. switch (fieldOptions.ListMode) {
  1247. case EProtobufListMode::Required:
  1248. return NTi::TTypePtr(NTi::List(*type));
  1249. case EProtobufListMode::Optional:
  1250. return NTi::TTypePtr(NTi::Optional(NTi::List(*type)));
  1251. }
  1252. Y_ABORT();
  1253. }
  1254. case FieldDescriptor::Label::LABEL_OPTIONAL:
  1255. return std::visit(TOverloaded{
  1256. [] (TOtherColumns) -> TTypePtrOrOtherColumns {
  1257. return TOtherColumns{};
  1258. },
  1259. [] (NTi::TTypePtr type) -> TTypePtrOrOtherColumns {
  1260. return NTi::TTypePtr(NTi::Optional(std::move(type)));
  1261. }
  1262. }, typeOrOtherColumns);
  1263. case FieldDescriptor::LABEL_REQUIRED: {
  1264. auto* type = std::get_if<NTi::TTypePtr>(&typeOrOtherColumns);
  1265. Y_ENSURE(type, "OTHER_COLUMNS field can not be required");
  1266. return *type;
  1267. }
  1268. }
  1269. Y_ABORT();
  1270. };
  1271. switch (fieldOptions.SerializationMode) {
  1272. case EProtobufSerializationMode::Protobuf:
  1273. return withFieldLabel(getScalarType());
  1274. case EProtobufSerializationMode::Yt:
  1275. if (fieldDescriptor.type() == FieldDescriptor::TYPE_MESSAGE) {
  1276. if (fieldDescriptor.is_map()) {
  1277. return GetMapType(fieldDescriptor, fieldOptions);
  1278. } else {
  1279. return withFieldLabel(GetMessageType(fieldDescriptor, TProtobufFieldOptions{}));
  1280. }
  1281. } else {
  1282. return withFieldLabel(getScalarType());
  1283. }
  1284. case EProtobufSerializationMode::Embedded:
  1285. ythrow yexception() << "EMBEDDED field is not allowed for field "
  1286. << fieldDescriptor.full_name();
  1287. }
  1288. Y_ABORT();
  1289. }
  1290. TTableSchema TTableSchemaInferrer::InferSchema(const Descriptor& messageDescriptor)
  1291. {
  1292. TTableSchema result;
  1293. auto defaultFieldOptions = GetDefaultFieldOptions(&messageDescriptor);
  1294. auto members = GetMessageMembers(
  1295. messageDescriptor.full_name(),
  1296. messageDescriptor,
  1297. defaultFieldOptions,
  1298. // Use special sort order for top level messages.
  1299. /*overrideFieldSortOrder*/ EProtobufFieldSortOrder::AsInProtoFile);
  1300. for (auto& member : members) {
  1301. std::visit(TOverloaded{
  1302. [&] (TOtherColumns) {
  1303. result.Strict(false);
  1304. },
  1305. [&] (NTi::TTypePtr& type) {
  1306. result.AddColumn(TColumnSchema()
  1307. .Name(std::move(member.Name))
  1308. .Type(std::move(type))
  1309. );
  1310. },
  1311. }, member.TypeOrOtherColumns);
  1312. }
  1313. return result;
  1314. }
  1315. TTableSchema CreateTableSchemaImpl(
  1316. const Descriptor& messageDescriptor,
  1317. bool keepFieldsWithoutExtension)
  1318. {
  1319. TTableSchemaInferrer inferrer(keepFieldsWithoutExtension);
  1320. return inferrer.InferSchema(messageDescriptor);
  1321. }
  1322. ////////////////////////////////////////////////////////////////////////////////
  1323. } // namespace NYT::NDetail
  1324. ////////////////////////////////////////////////////////////////////////////////
  1325. template <>
  1326. void Out<NYT::EWrapperFieldFlag::Enum>(IOutputStream& stream, NYT::EWrapperFieldFlag::Enum value)
  1327. {
  1328. stream << NYT::EWrapperFieldFlag_Enum_Name(value);
  1329. }
  1330. template <>
  1331. void Out<NYT::EWrapperMessageFlag::Enum>(IOutputStream& stream, NYT::EWrapperMessageFlag::Enum value)
  1332. {
  1333. stream << NYT::EWrapperMessageFlag_Enum_Name(value);
  1334. }
  1335. template <>
  1336. void Out<NYT::EWrapperOneofFlag::Enum>(IOutputStream& stream, NYT::EWrapperOneofFlag::Enum value)
  1337. {
  1338. stream << NYT::EWrapperOneofFlag_Enum_Name(value);
  1339. }