prepare_operation.cpp 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. #include "prepare_operation.h"
  2. #include <yt/cpp/mapreduce/common/retry_lib.h>
  3. #include <yt/cpp/mapreduce/http/retry_request.h>
  4. #include <yt/cpp/mapreduce/interface/raw_batch_request.h>
  5. #include <yt/cpp/mapreduce/interface/raw_client.h>
  6. #include <yt/cpp/mapreduce/interface/serialize.h>
  7. #include <library/cpp/iterator/functools.h>
  8. namespace NYT::NDetail {
  9. ////////////////////////////////////////////////////////////////////////////////
  10. TOperationPreparationContext::TOperationPreparationContext(
  11. const TStructuredJobTableList& structuredInputs,
  12. const TStructuredJobTableList& structuredOutputs,
  13. const IRawClientPtr& rawClient,
  14. const IClientRetryPolicyPtr& retryPolicy,
  15. TTransactionId transactionId)
  16. : RawClient_(rawClient)
  17. , RetryPolicy_(retryPolicy)
  18. , TransactionId_(transactionId)
  19. , InputSchemas_(structuredInputs.size())
  20. , InputSchemasLoaded_(structuredInputs.size(), false)
  21. {
  22. Inputs_.reserve(structuredInputs.size());
  23. for (const auto& input : structuredInputs) {
  24. Inputs_.push_back(input.RichYPath);
  25. }
  26. Outputs_.reserve(structuredOutputs.size());
  27. for (const auto& output : structuredOutputs) {
  28. Outputs_.push_back(output.RichYPath);
  29. }
  30. }
  31. TOperationPreparationContext::TOperationPreparationContext(
  32. TVector<TRichYPath> inputs,
  33. TVector<TRichYPath> outputs,
  34. const IRawClientPtr& rawClient,
  35. const IClientRetryPolicyPtr& retryPolicy,
  36. TTransactionId transactionId)
  37. : RawClient_(rawClient)
  38. , RetryPolicy_(retryPolicy)
  39. , TransactionId_(transactionId)
  40. , InputSchemas_(inputs.size())
  41. , InputSchemasLoaded_(inputs.size(), false)
  42. {
  43. Inputs_.reserve(inputs.size());
  44. for (auto& input : inputs) {
  45. Inputs_.push_back(std::move(input));
  46. }
  47. Outputs_.reserve(outputs.size());
  48. for (const auto& output : outputs) {
  49. Outputs_.push_back(std::move(output));
  50. }
  51. }
  52. int TOperationPreparationContext::GetInputCount() const
  53. {
  54. return static_cast<int>(Inputs_.size());
  55. }
  56. int TOperationPreparationContext::GetOutputCount() const
  57. {
  58. return static_cast<int>(Outputs_.size());
  59. }
  60. const TVector<TTableSchema>& TOperationPreparationContext::GetInputSchemas() const
  61. {
  62. TVector<::NThreading::TFuture<TNode>> schemaFutures;
  63. auto batch = RawClient_->CreateRawBatchRequest();
  64. for (int tableIndex = 0; tableIndex < static_cast<int>(InputSchemas_.size()); ++tableIndex) {
  65. if (InputSchemasLoaded_[tableIndex]) {
  66. schemaFutures.emplace_back();
  67. continue;
  68. }
  69. Y_ABORT_UNLESS(Inputs_[tableIndex]);
  70. schemaFutures.push_back(batch->Get(TransactionId_, Inputs_[tableIndex]->Path_ + "/@schema", TGetOptions{}));
  71. }
  72. batch->ExecuteBatch();
  73. for (int tableIndex = 0; tableIndex < static_cast<int>(InputSchemas_.size()); ++tableIndex) {
  74. if (schemaFutures[tableIndex].Initialized()) {
  75. Deserialize(InputSchemas_[tableIndex], schemaFutures[tableIndex].ExtractValueSync());
  76. }
  77. }
  78. return InputSchemas_;
  79. }
  80. const TTableSchema& TOperationPreparationContext::GetInputSchema(int index) const
  81. {
  82. auto& schema = InputSchemas_[index];
  83. if (!InputSchemasLoaded_[index]) {
  84. Y_ABORT_UNLESS(Inputs_[index]);
  85. auto schemaNode = RequestWithRetry<TNode>(
  86. RetryPolicy_->CreatePolicyForGenericRequest(),
  87. [this, &index] (TMutationId /*mutationId*/) {
  88. return RawClient_->Get(TransactionId_, Inputs_[index]->Path_ + "/@schema");
  89. });
  90. Deserialize(schema, schemaNode);
  91. }
  92. return schema;
  93. }
  94. TMaybe<TYPath> TOperationPreparationContext::GetInputPath(int index) const
  95. {
  96. Y_ABORT_UNLESS(index < static_cast<int>(Inputs_.size()));
  97. if (Inputs_[index]) {
  98. return Inputs_[index]->Path_;
  99. }
  100. return Nothing();
  101. }
  102. TMaybe<TYPath> TOperationPreparationContext::GetOutputPath(int index) const
  103. {
  104. Y_ABORT_UNLESS(index < static_cast<int>(Outputs_.size()));
  105. if (Outputs_[index]) {
  106. return Outputs_[index]->Path_;
  107. }
  108. return Nothing();
  109. }
  110. ////////////////////////////////////////////////////////////////////////////////
  111. TSpeculativeOperationPreparationContext::TSpeculativeOperationPreparationContext(
  112. const TVector<TTableSchema>& previousResult,
  113. TStructuredJobTableList inputs,
  114. TStructuredJobTableList outputs)
  115. : InputSchemas_(previousResult)
  116. , Inputs_(std::move(inputs))
  117. , Outputs_(std::move(outputs))
  118. {
  119. Y_ABORT_UNLESS(Inputs_.size() == previousResult.size());
  120. }
  121. int TSpeculativeOperationPreparationContext::GetInputCount() const
  122. {
  123. return static_cast<int>(Inputs_.size());
  124. }
  125. int TSpeculativeOperationPreparationContext::GetOutputCount() const
  126. {
  127. return static_cast<int>(Outputs_.size());
  128. }
  129. const TVector<TTableSchema>& TSpeculativeOperationPreparationContext::GetInputSchemas() const
  130. {
  131. return InputSchemas_;
  132. }
  133. const TTableSchema& TSpeculativeOperationPreparationContext::GetInputSchema(int index) const
  134. {
  135. Y_ABORT_UNLESS(index < static_cast<int>(InputSchemas_.size()));
  136. return InputSchemas_[index];
  137. }
  138. TMaybe<TYPath> TSpeculativeOperationPreparationContext::GetInputPath(int index) const
  139. {
  140. Y_ABORT_UNLESS(index < static_cast<int>(Inputs_.size()));
  141. if (Inputs_[index].RichYPath) {
  142. return Inputs_[index].RichYPath->Path_;
  143. }
  144. return Nothing();
  145. }
  146. TMaybe<TYPath> TSpeculativeOperationPreparationContext::GetOutputPath(int index) const
  147. {
  148. Y_ABORT_UNLESS(index < static_cast<int>(Outputs_.size()));
  149. if (Outputs_[index].RichYPath) {
  150. return Outputs_[index].RichYPath->Path_;
  151. }
  152. return Nothing();
  153. }
  154. ////////////////////////////////////////////////////////////////////////////////
  155. static void FixInputTable(TRichYPath& table, int index, const TJobOperationPreparer& preparer)
  156. {
  157. const auto& columnRenamings = preparer.GetInputColumnRenamings();
  158. const auto& columnFilters = preparer.GetInputColumnFilters();
  159. if (!columnRenamings[index].empty()) {
  160. table.RenameColumns(columnRenamings[index]);
  161. }
  162. if (columnFilters[index]) {
  163. table.Columns(*columnFilters[index]);
  164. }
  165. }
  166. static void FixInputTable(TStructuredJobTable& table, int index, const TJobOperationPreparer& preparer)
  167. {
  168. const auto& inputDescriptions = preparer.GetInputDescriptions();
  169. if (inputDescriptions[index] && std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) {
  170. table.Description = *inputDescriptions[index];
  171. }
  172. if (table.RichYPath) {
  173. FixInputTable(*table.RichYPath, index, preparer);
  174. }
  175. }
  176. static void FixOutputTable(TRichYPath& /* table */, int /* index */, const TJobOperationPreparer& /* preparer */)
  177. { }
  178. static void FixOutputTable(TStructuredJobTable& table, int index, const TJobOperationPreparer& preparer)
  179. {
  180. const auto& outputDescriptions = preparer.GetOutputDescriptions();
  181. if (outputDescriptions[index] && std::holds_alternative<TUnspecifiedTableStructure>(table.Description)) {
  182. table.Description = *outputDescriptions[index];
  183. }
  184. if (table.RichYPath) {
  185. FixOutputTable(*table.RichYPath, index, preparer);
  186. }
  187. }
  188. template <typename TTables>
  189. TVector<TTableSchema> PrepareOperation(
  190. const IJob& job,
  191. const IOperationPreparationContext& context,
  192. TTables* inputsPtr,
  193. TTables* outputsPtr,
  194. TUserJobFormatHints& hints)
  195. {
  196. TJobOperationPreparer preparer(context);
  197. job.PrepareOperation(context, preparer);
  198. preparer.Finish();
  199. if (inputsPtr) {
  200. auto& inputs = *inputsPtr;
  201. for (int i = 0; i < static_cast<int>(inputs.size()); ++i) {
  202. FixInputTable(inputs[i], i, preparer);
  203. }
  204. }
  205. if (outputsPtr) {
  206. auto& outputs = *outputsPtr;
  207. for (int i = 0; i < static_cast<int>(outputs.size()); ++i) {
  208. FixOutputTable(outputs[i], i, preparer);
  209. }
  210. }
  211. auto applyPatch = [](TMaybe<TFormatHints>& origin, const TMaybe<TFormatHints>& patch) {
  212. if (origin) {
  213. if (patch) {
  214. origin->Merge(*patch);
  215. }
  216. } else {
  217. origin = patch;
  218. }
  219. };
  220. auto preparerHints = preparer.GetFormatHints();
  221. applyPatch(preparerHints.InputFormatHints_, hints.InputFormatHints_);
  222. applyPatch(preparerHints.OutputFormatHints_, hints.OutputFormatHints_);
  223. hints = std::move(preparerHints);
  224. return preparer.GetOutputSchemas();
  225. }
  226. template
  227. TVector<TTableSchema> PrepareOperation<TStructuredJobTableList>(
  228. const IJob& job,
  229. const IOperationPreparationContext& context,
  230. TStructuredJobTableList* inputsPtr,
  231. TStructuredJobTableList* outputsPtr,
  232. TUserJobFormatHints& hints);
  233. template
  234. TVector<TTableSchema> PrepareOperation<TVector<TRichYPath>>(
  235. const IJob& job,
  236. const IOperationPreparationContext& context,
  237. TVector<TRichYPath>* inputsPtr,
  238. TVector<TRichYPath>* outputsPtr,
  239. TUserJobFormatHints& hints);
  240. ////////////////////////////////////////////////////////////////////////////////
  241. } // namespace NYT::NDetail