// structured_table_formats.cpp (19 KB)
  1. #include "structured_table_formats.h"
  2. #include "format_hints.h"
  3. #include "skiff.h"
  4. #include <yt/cpp/mapreduce/common/retry_lib.h>
  5. #include <yt/cpp/mapreduce/io/yamr_table_reader.h>
  6. #include <yt/cpp/mapreduce/library/table_schema/protobuf.h>
  7. #include <yt/cpp/mapreduce/interface/common.h>
  8. #include <yt/cpp/mapreduce/raw_client/raw_requests.h>
  9. #include <library/cpp/type_info/type_info.h>
  10. #include <library/cpp/yson/writer.h>
  11. #include <memory>
  12. namespace NYT {
  13. ////////////////////////////////////////////////////////////////////////////////
  14. TMaybe<TNode> GetCommonTableFormat(
  15. const TVector<TMaybe<TNode>>& formats)
  16. {
  17. TMaybe<TNode> result;
  18. bool start = true;
  19. for (auto& format : formats) {
  20. if (start) {
  21. result = format;
  22. start = false;
  23. continue;
  24. }
  25. if (result.Defined() != format.Defined()) {
  26. ythrow yexception() << "Different formats of input tables";
  27. }
  28. if (!result.Defined()) {
  29. continue;
  30. }
  31. auto& resultAttrs = result.Get()->GetAttributes();
  32. auto& formatAttrs = format.Get()->GetAttributes();
  33. if (resultAttrs["key_column_names"] != formatAttrs["key_column_names"]) {
  34. ythrow yexception() << "Different formats of input tables";
  35. }
  36. bool hasSubkeyColumns = resultAttrs.HasKey("subkey_column_names");
  37. if (hasSubkeyColumns != formatAttrs.HasKey("subkey_column_names")) {
  38. ythrow yexception() << "Different formats of input tables";
  39. }
  40. if (hasSubkeyColumns &&
  41. resultAttrs["subkey_column_names"] != formatAttrs["subkey_column_names"])
  42. {
  43. ythrow yexception() << "Different formats of input tables";
  44. }
  45. }
  46. return result;
  47. }
  48. TMaybe<TNode> GetTableFormat(
  49. const IClientRetryPolicyPtr& retryPolicy,
  50. const TClientContext& context,
  51. const TTransactionId& transactionId,
  52. const TRichYPath& path)
  53. {
  54. auto formatPath = path.Path_ + "/@_format";
  55. if (!NDetail::NRawClient::Exists(retryPolicy->CreatePolicyForGenericRequest(), context, transactionId, formatPath)) {
  56. return TMaybe<TNode>();
  57. }
  58. TMaybe<TNode> format = NDetail::NRawClient::Get(retryPolicy->CreatePolicyForGenericRequest(), context, transactionId, formatPath);
  59. if (format.Get()->AsString() != "yamred_dsv") {
  60. return TMaybe<TNode>();
  61. }
  62. auto& formatAttrs = format.Get()->Attributes();
  63. if (!formatAttrs.HasKey("key_column_names")) {
  64. ythrow yexception() <<
  65. "Table '" << path.Path_ << "': attribute 'key_column_names' is missing";
  66. }
  67. formatAttrs["has_subkey"] = "true";
  68. formatAttrs["lenval"] = "true";
  69. return format;
  70. }
  71. TMaybe<TNode> GetTableFormats(
  72. const IClientRetryPolicyPtr& clientRetryPolicy,
  73. const TClientContext& context,
  74. const TTransactionId& transactionId,
  75. const TVector<TRichYPath>& inputs)
  76. {
  77. TVector<TMaybe<TNode>> formats;
  78. for (auto& table : inputs) {
  79. formats.push_back(GetTableFormat(clientRetryPolicy, context, transactionId, table));
  80. }
  81. return GetCommonTableFormat(formats);
  82. }
  83. ////////////////////////////////////////////////////////////////////////////////
  84. namespace NDetail {
  85. ////////////////////////////////////////////////////////////////////////////////
  86. NSkiff::TSkiffSchemaPtr TryCreateSkiffSchema(
  87. const TClientContext& context,
  88. const IClientRetryPolicyPtr& clientRetryPolicy,
  89. const TTransactionId& transactionId,
  90. const TVector<TRichYPath>& tables,
  91. const TOperationOptions& options,
  92. ENodeReaderFormat nodeReaderFormat)
  93. {
  94. bool hasInputQuery = options.Spec_.Defined() && options.Spec_->IsMap() && options.Spec_->HasKey("input_query");
  95. if (hasInputQuery) {
  96. Y_ENSURE_EX(nodeReaderFormat != ENodeReaderFormat::Skiff,
  97. TApiUsageError() << "Cannot use Skiff format for operations with 'input_query' in spec");
  98. return nullptr;
  99. }
  100. return CreateSkiffSchemaIfNecessary(
  101. context,
  102. clientRetryPolicy,
  103. transactionId,
  104. nodeReaderFormat,
  105. tables,
  106. TCreateSkiffSchemaOptions()
  107. .HasKeySwitch(true)
  108. .HasRangeIndex(true));
  109. }
  110. TString CreateSkiffConfig(const NSkiff::TSkiffSchemaPtr& schema)
  111. {
  112. TString result;
  113. TStringOutput stream(result);
  114. ::NYson::TYsonWriter writer(&stream);
  115. Serialize(schema, &writer);
  116. return result;
  117. }
  118. TString CreateProtoConfig(const TVector<const ::google::protobuf::Descriptor*>& descriptorList)
  119. {
  120. TString result;
  121. TStringOutput messageTypeList(result);
  122. for (const auto& descriptor : descriptorList) {
  123. messageTypeList << descriptor->full_name() << Endl;
  124. }
  125. return result;
  126. }
  127. ////////////////////////////////////////////////////////////////////////////////
  128. struct TGetTableStructureDescriptionStringImpl {
  129. template<typename T>
  130. TString operator()(const T& description) {
  131. if constexpr (std::is_same_v<T, TUnspecifiedTableStructure>) {
  132. return "Unspecified";
  133. } else if constexpr (std::is_same_v<T, TProtobufTableStructure>) {
  134. TString res;
  135. TStringStream out(res);
  136. if (description.Descriptor) {
  137. out << description.Descriptor->full_name();
  138. } else {
  139. out << "<unknown>";
  140. }
  141. out << " protobuf message";
  142. return res;
  143. } else {
  144. static_assert(TDependentFalse<T>, "Unknown type");
  145. }
  146. }
  147. };
  148. TString GetTableStructureDescriptionString(const TTableStructure& tableStructure)
  149. {
  150. return std::visit(TGetTableStructureDescriptionStringImpl(), tableStructure);
  151. }
  152. ////////////////////////////////////////////////////////////////////////////////
  153. TString JobTablePathString(const TStructuredJobTable& jobTable)
  154. {
  155. if (jobTable.RichYPath) {
  156. return jobTable.RichYPath->Path_;
  157. } else {
  158. return "<intermediate-table>";
  159. }
  160. }
  161. TStructuredJobTableList ToStructuredJobTableList(const TVector<TStructuredTablePath>& tableList)
  162. {
  163. TStructuredJobTableList result;
  164. for (const auto& table : tableList) {
  165. result.push_back(TStructuredJobTable{table.Description, table.RichYPath});
  166. }
  167. return result;
  168. }
  169. TStructuredJobTableList CanonizeStructuredTableList(const TClientContext& context, const TVector<TStructuredTablePath>& tableList)
  170. {
  171. TVector<TRichYPath> toCanonize;
  172. toCanonize.reserve(tableList.size());
  173. for (const auto& table : tableList) {
  174. toCanonize.emplace_back(table.RichYPath);
  175. }
  176. const auto canonized = NRawClient::CanonizeYPaths(/* retryPolicy */ nullptr, context, toCanonize);
  177. Y_ABORT_UNLESS(canonized.size() == tableList.size());
  178. TStructuredJobTableList result;
  179. result.reserve(tableList.size());
  180. for (size_t i = 0; i != canonized.size(); ++i) {
  181. result.emplace_back(TStructuredJobTable{tableList[i].Description, canonized[i]});
  182. }
  183. return result;
  184. }
  185. TVector<TRichYPath> GetPathList(
  186. const TStructuredJobTableList& tableList,
  187. const TMaybe<TVector<TTableSchema>>& jobSchemaInferenceResult,
  188. bool inferSchemaFromDescriptions)
  189. {
  190. Y_ABORT_UNLESS(!jobSchemaInferenceResult || tableList.size() == jobSchemaInferenceResult->size());
  191. auto maybeInferSchema = [&] (const TStructuredJobTable& table, ui32 tableIndex) -> TMaybe<TTableSchema> {
  192. if (jobSchemaInferenceResult && !jobSchemaInferenceResult->at(tableIndex).Empty()) {
  193. return jobSchemaInferenceResult->at(tableIndex);
  194. }
  195. if (inferSchemaFromDescriptions) {
  196. return GetTableSchema(table.Description);
  197. }
  198. return Nothing();
  199. };
  200. TVector<TRichYPath> result;
  201. result.reserve(tableList.size());
  202. for (size_t tableIndex = 0; tableIndex != tableList.size(); ++tableIndex) {
  203. const auto& table = tableList[tableIndex];
  204. Y_ABORT_UNLESS(table.RichYPath, "Cannot get path for intermediate table");
  205. auto richYPath = *table.RichYPath;
  206. if (!richYPath.Schema_) {
  207. if (auto schema = maybeInferSchema(table, tableIndex)) {
  208. richYPath.Schema(std::move(*schema));
  209. }
  210. }
  211. result.emplace_back(std::move(richYPath));
  212. }
  213. return result;
  214. }
  215. TStructuredRowStreamDescription GetJobStreamDescription(
  216. const IStructuredJob& job,
  217. EIODirection direction)
  218. {
  219. switch (direction) {
  220. case EIODirection::Input:
  221. return job.GetInputRowStreamDescription();
  222. case EIODirection::Output:
  223. return job.GetOutputRowStreamDescription();
  224. default:
  225. Y_ABORT("unreachable");
  226. }
  227. }
  228. TString GetSuffix(EIODirection direction)
  229. {
  230. switch (direction) {
  231. case EIODirection::Input:
  232. return "_input";
  233. case EIODirection::Output:
  234. return "_output";
  235. }
  236. Y_ABORT("unreachable");
  237. }
  238. TString GetAddIOMethodName(EIODirection direction)
  239. {
  240. switch (direction) {
  241. case EIODirection::Input:
  242. return "AddInput<>";
  243. case EIODirection::Output:
  244. return "AddOutput<>";
  245. }
  246. Y_ABORT("unreachable");
  247. }
  248. ////////////////////////////////////////////////////////////////////////////////
  249. struct TFormatBuilder::TFormatSwitcher
  250. {
  251. template <typename T>
  252. auto operator() (const T& /*t*/) {
  253. if constexpr (std::is_same_v<T, TTNodeStructuredRowStream>) {
  254. return &TFormatBuilder::CreateNodeFormat;
  255. } else if constexpr (std::is_same_v<T, TTYaMRRowStructuredRowStream>) {
  256. return &TFormatBuilder::CreateYamrFormat;
  257. } else if constexpr (std::is_same_v<T, TProtobufStructuredRowStream>) {
  258. return &TFormatBuilder::CreateProtobufFormat;
  259. } else if constexpr (std::is_same_v<T, TVoidStructuredRowStream>) {
  260. return &TFormatBuilder::CreateVoidFormat;
  261. } else {
  262. static_assert(TDependentFalse<T>, "unknown stream description");
  263. }
  264. }
  265. };
  266. TFormatBuilder::TFormatBuilder(
  267. IClientRetryPolicyPtr clientRetryPolicy,
  268. TClientContext context,
  269. TTransactionId transactionId,
  270. TOperationOptions operationOptions)
  271. : ClientRetryPolicy_(std::move(clientRetryPolicy))
  272. , Context_(std::move(context))
  273. , TransactionId_(transactionId)
  274. , OperationOptions_(std::move(operationOptions))
  275. { }
  276. std::pair <TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateFormat(
  277. const IStructuredJob& job,
  278. const EIODirection& direction,
  279. const TStructuredJobTableList& structuredTableList,
  280. const TMaybe <TFormatHints>& formatHints,
  281. ENodeReaderFormat nodeReaderFormat,
  282. bool allowFormatFromTableAttribute)
  283. {
  284. auto jobStreamDescription = GetJobStreamDescription(job, direction);
  285. auto method = std::visit(TFormatSwitcher(), jobStreamDescription);
  286. return (this->*method)(
  287. job,
  288. direction,
  289. structuredTableList,
  290. formatHints,
  291. nodeReaderFormat,
  292. allowFormatFromTableAttribute);
  293. }
  294. std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateVoidFormat(
  295. const IStructuredJob& /*job*/,
  296. const EIODirection& /*direction*/,
  297. const TStructuredJobTableList& /*structuredTableList*/,
  298. const TMaybe<TFormatHints>& /*formatHints*/,
  299. ENodeReaderFormat /*nodeReaderFormat*/,
  300. bool /*allowFormatFromTableAttribute*/)
  301. {
  302. return {
  303. TFormat(),
  304. Nothing()
  305. };
  306. }
  307. std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateYamrFormat(
  308. const IStructuredJob& job,
  309. const EIODirection& direction,
  310. const TStructuredJobTableList& structuredTableList,
  311. const TMaybe<TFormatHints>& /*formatHints*/,
  312. ENodeReaderFormat /*nodeReaderFormat*/,
  313. bool allowFormatFromTableAttribute)
  314. {
  315. for (const auto& table: structuredTableList) {
  316. if (!std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) {
  317. ythrow TApiUsageError()
  318. << "cannot use " << direction << " table '" << JobTablePathString(table)
  319. << "' with job " << TJobFactory::Get()->GetJobName(&job) << "; "
  320. << "table has unsupported structure description; check " << GetAddIOMethodName(direction) << " for this table";
  321. }
  322. }
  323. TMaybe<TNode> formatFromTableAttributes;
  324. if (allowFormatFromTableAttribute && OperationOptions_.UseTableFormats_) {
  325. TVector<TRichYPath> tableList;
  326. for (const auto& table: structuredTableList) {
  327. Y_ABORT_UNLESS(table.RichYPath, "Cannot use format from table for intermediate table");
  328. tableList.push_back(*table.RichYPath);
  329. }
  330. formatFromTableAttributes = GetTableFormats(ClientRetryPolicy_, Context_, TransactionId_, tableList);
  331. }
  332. if (formatFromTableAttributes) {
  333. return {
  334. TFormat(*formatFromTableAttributes),
  335. Nothing()
  336. };
  337. } else {
  338. auto formatNode = TNode("yamr");
  339. formatNode.Attributes() = TNode()
  340. ("lenval", true)
  341. ("has_subkey", true)
  342. ("enable_table_index", true);
  343. return {
  344. TFormat(formatNode),
  345. Nothing()
  346. };
  347. }
  348. }
  349. std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateNodeFormat(
  350. const IStructuredJob& job,
  351. const EIODirection& direction,
  352. const TStructuredJobTableList& structuredTableList,
  353. const TMaybe<TFormatHints>& formatHints,
  354. ENodeReaderFormat nodeReaderFormat,
  355. bool /*allowFormatFromTableAttribute*/)
  356. {
  357. for (const auto& table: structuredTableList) {
  358. if (!std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) {
  359. ythrow TApiUsageError()
  360. << "cannot use " << direction << " table '" << JobTablePathString(table)
  361. << "' with job " << TJobFactory::Get()->GetJobName(&job) << "; "
  362. << "table has unsupported structure description; check AddInput<> / AddOutput<> for this table";
  363. }
  364. }
  365. NSkiff::TSkiffSchemaPtr skiffSchema = nullptr;
  366. if (nodeReaderFormat != ENodeReaderFormat::Yson) {
  367. TVector<TRichYPath> tableList;
  368. for (const auto& table: structuredTableList) {
  369. Y_ABORT_UNLESS(table.RichYPath, "Cannot use skiff with temporary tables");
  370. tableList.emplace_back(*table.RichYPath);
  371. }
  372. skiffSchema = TryCreateSkiffSchema(
  373. Context_,
  374. ClientRetryPolicy_,
  375. TransactionId_,
  376. tableList,
  377. OperationOptions_,
  378. nodeReaderFormat);
  379. }
  380. if (skiffSchema) {
  381. auto format = CreateSkiffFormat(skiffSchema);
  382. NYT::NDetail::ApplyFormatHints<TNode>(&format, formatHints);
  383. return {
  384. format,
  385. TSmallJobFile{
  386. TString("skiff") + GetSuffix(direction),
  387. CreateSkiffConfig(skiffSchema)
  388. }
  389. };
  390. } else {
  391. auto format = TFormat::YsonBinary();
  392. NYT::NDetail::ApplyFormatHints<TNode>(&format, formatHints);
  393. return {
  394. format,
  395. Nothing()
  396. };
  397. }
  398. }
  399. [[noreturn]] static void ThrowUnsupportedStructureDescription(
  400. const EIODirection& direction,
  401. const TStructuredJobTable& table,
  402. const IStructuredJob& job)
  403. {
  404. ythrow TApiUsageError()
  405. << "cannot use " << direction << " table '" << JobTablePathString(table)
  406. << "' with job " << TJobFactory::Get()->GetJobName(&job) << "; "
  407. << "table has unsupported structure description; check " << GetAddIOMethodName(direction) << " for this table";
  408. }
  409. [[noreturn]] static void ThrowTypeDeriveFail(
  410. const EIODirection& direction,
  411. const IStructuredJob& job,
  412. const TString& type)
  413. {
  414. ythrow TApiUsageError()
  415. << "Cannot derive exact " << type << " type for intermediate " << direction << " table for job "
  416. << TJobFactory::Get()->GetJobName(&job)
  417. << "; use one of TMapReduceOperationSpec::Hint* methods to specify intermediate table structure";
  418. }
  419. [[noreturn]] static void ThrowUnexpectedDifferentDescriptors(
  420. const EIODirection& direction,
  421. const TStructuredJobTable& table,
  422. const IStructuredJob& job,
  423. const TMaybe<TStringBuf> jobDescriptorName,
  424. const TMaybe<TStringBuf> descriptorName)
  425. {
  426. ythrow TApiUsageError()
  427. << "Job " << TJobFactory::Get()->GetJobName(&job) << " expects "
  428. << jobDescriptorName << " as " << direction << ", but table " << JobTablePathString(table)
  429. << " is tagged with " << descriptorName;
  430. }
  431. std::pair<TFormat, TMaybe<TSmallJobFile>> TFormatBuilder::CreateProtobufFormat(
  432. const IStructuredJob& job,
  433. const EIODirection& direction,
  434. const TStructuredJobTableList& structuredTableList,
  435. const TMaybe<TFormatHints>& /*formatHints*/,
  436. ENodeReaderFormat /*nodeReaderFormat*/,
  437. bool /*allowFormatFromTableAttribute*/)
  438. {
  439. if (Context_.Config->UseClientProtobuf) {
  440. return {
  441. TFormat::YsonBinary(),
  442. TSmallJobFile{
  443. TString("proto") + GetSuffix(direction),
  444. CreateProtoConfig({}),
  445. },
  446. };
  447. }
  448. const ::google::protobuf::Descriptor* const jobDescriptor =
  449. std::get<TProtobufStructuredRowStream>(GetJobStreamDescription(job, direction)).Descriptor;
  450. Y_ENSURE(!structuredTableList.empty(),
  451. "empty " << direction << " tables for job " << TJobFactory::Get()->GetJobName(&job));
  452. TVector<const ::google::protobuf::Descriptor*> descriptorList;
  453. for (const auto& table : structuredTableList) {
  454. const ::google::protobuf::Descriptor* descriptor = nullptr;
  455. if (std::holds_alternative<TProtobufTableStructure>(table.Description)) {
  456. descriptor = std::get<TProtobufTableStructure>(table.Description).Descriptor;
  457. } else if (table.RichYPath) {
  458. ThrowUnsupportedStructureDescription(direction, table, job);
  459. }
  460. if (!descriptor) {
  461. // It must be intermediate table, because there is no proper way to add such table to spec
  462. // (AddInput requires to specify proper message).
  463. Y_ABORT_UNLESS(!table.RichYPath, "Descriptors for all tables except intermediate must be known");
  464. if (jobDescriptor) {
  465. descriptor = jobDescriptor;
  466. } else {
  467. ThrowTypeDeriveFail(direction, job, "protobuf");
  468. }
  469. }
  470. if (jobDescriptor && descriptor != jobDescriptor) {
  471. ThrowUnexpectedDifferentDescriptors(
  472. direction,
  473. table,
  474. job,
  475. jobDescriptor->full_name(),
  476. descriptor->full_name());
  477. }
  478. descriptorList.push_back(descriptor);
  479. }
  480. Y_ABORT_UNLESS(!descriptorList.empty(), "Messages for proto format are unknown (empty ProtoDescriptors)");
  481. return {
  482. TFormat::Protobuf(descriptorList, Context_.Config->ProtobufFormatWithDescriptors),
  483. TSmallJobFile{
  484. TString("proto") + GetSuffix(direction),
  485. CreateProtoConfig(descriptorList)
  486. },
  487. };
  488. }
  489. ////////////////////////////////////////////////////////////////////////////////
  490. struct TGetTableSchemaImpl
  491. {
  492. template <typename T>
  493. TMaybe<TTableSchema> operator() (const T& description) {
  494. if constexpr (std::is_same_v<T, TUnspecifiedTableStructure>) {
  495. return Nothing();
  496. } else if constexpr (std::is_same_v<T, TProtobufTableStructure>) {
  497. if (!description.Descriptor) {
  498. return Nothing();
  499. }
  500. return CreateTableSchema(*description.Descriptor);
  501. } else {
  502. static_assert(TDependentFalse<T>, "unknown type");
  503. }
  504. }
  505. };
  506. TMaybe<TTableSchema> GetTableSchema(const TTableStructure& tableStructure)
  507. {
  508. return std::visit(TGetTableSchemaImpl(), tableStructure);
  509. }
  510. ////////////////////////////////////////////////////////////////////////////////
  511. } // namespace NDetail
  512. } // namespace NYT