// operation_preparer.cpp
  1. #include "operation_preparer.h"
  2. #include "init.h"
  3. #include "file_writer.h"
  4. #include "operation.h"
  5. #include "operation_helpers.h"
  6. #include "operation_tracker.h"
  7. #include "transaction.h"
  8. #include "transaction_pinger.h"
  9. #include "yt_poller.h"
  10. #include <yt/cpp/mapreduce/common/helpers.h>
  11. #include <yt/cpp/mapreduce/common/retry_lib.h>
  12. #include <yt/cpp/mapreduce/common/wait_proxy.h>
  13. #include <yt/cpp/mapreduce/http_client/raw_requests.h>
  14. #include <yt/cpp/mapreduce/interface/error_codes.h>
  15. #include <yt/cpp/mapreduce/interface/raw_client.h>
  16. #include <yt/cpp/mapreduce/interface/logging/yt_log.h>
  17. #include <library/cpp/digest/md5/md5.h>
  18. #include <util/folder/path.h>
  19. #include <util/string/builder.h>
  20. #include <util/system/execpath.h>
  21. namespace NYT::NDetail {
  22. ////////////////////////////////////////////////////////////////////////////////
  23. class TWaitOperationStartPollerItem
  24. : public IYtPollerItem
  25. {
  26. public:
  27. TWaitOperationStartPollerItem(TOperationId operationId, std::unique_ptr<TPingableTransaction> transaction)
  28. : OperationId_(operationId)
  29. , Transaction_(std::move(transaction))
  30. { }
  31. void PrepareRequest(IRawBatchRequest* batchRequest) override
  32. {
  33. Future_ = batchRequest->GetOperation(
  34. OperationId_,
  35. TGetOperationOptions().AttributeFilter(
  36. TOperationAttributeFilter().Add(EOperationAttribute::State)));
  37. }
  38. EStatus OnRequestExecuted() override
  39. {
  40. try {
  41. auto attributes = Future_.GetValue();
  42. Y_ENSURE(attributes.State.Defined());
  43. bool operationHasLockedFiles =
  44. *attributes.State != "starting" &&
  45. *attributes.State != "pending" &&
  46. *attributes.State != "orphaned" &&
  47. *attributes.State != "waiting_for_agent" &&
  48. *attributes.State != "initializing";
  49. return operationHasLockedFiles ? EStatus::PollBreak : EStatus::PollContinue;
  50. } catch (const TErrorResponse& e) {
  51. YT_LOG_ERROR("get_operation request failed: %v (RequestId: %v)",
  52. e.GetError().GetMessage(),
  53. e.GetRequestId());
  54. return IsRetriable(e) ? PollContinue : PollBreak;
  55. } catch (const std::exception& e) {
  56. YT_LOG_ERROR("%v", e.what());
  57. return PollBreak;
  58. }
  59. }
  60. void OnItemDiscarded() override {
  61. }
  62. private:
  63. TOperationId OperationId_;
  64. std::unique_ptr<TPingableTransaction> Transaction_;
  65. ::NThreading::TFuture<TOperationAttributes> Future_;
  66. };
  67. ////////////////////////////////////////////////////////////////////////////////
  68. class TOperationForwardingRequestRetryPolicy
  69. : public IRequestRetryPolicy
  70. {
  71. public:
  72. TOperationForwardingRequestRetryPolicy(const IRequestRetryPolicyPtr& underlying, const TOperationPtr& operation)
  73. : Underlying_(underlying)
  74. , Operation_(operation)
  75. { }
  76. void NotifyNewAttempt() override
  77. {
  78. Underlying_->NotifyNewAttempt();
  79. }
  80. TMaybe<TDuration> OnGenericError(const std::exception& e) override
  81. {
  82. UpdateOperationStatus(e.what());
  83. return Underlying_->OnGenericError(e);
  84. }
  85. TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) override
  86. {
  87. auto msg = e.GetError().ShortDescription();
  88. UpdateOperationStatus(msg);
  89. return Underlying_->OnRetriableError(e);
  90. }
  91. void OnIgnoredError(const TErrorResponse& e) override
  92. {
  93. Underlying_->OnIgnoredError(e);
  94. }
  95. TString GetAttemptDescription() const override
  96. {
  97. return Underlying_->GetAttemptDescription();
  98. }
  99. private:
  100. void UpdateOperationStatus(TStringBuf err)
  101. {
  102. Y_ABORT_UNLESS(Operation_);
  103. Operation_->OnStatusUpdated(
  104. ::TStringBuilder() << "Retriable error during operation start: " << err);
  105. }
  106. private:
  107. IRequestRetryPolicyPtr Underlying_;
  108. TOperationPtr Operation_;
  109. };
  110. ////////////////////////////////////////////////////////////////////////////////
  111. TOperationPreparer::TOperationPreparer(TClientPtr client, TTransactionId transactionId)
  112. : Client_(std::move(client))
  113. , TransactionId_(transactionId)
  114. , FileTransaction_(std::make_unique<TPingableTransaction>(
  115. Client_->GetRawClient(),
  116. Client_->GetRetryPolicy(),
  117. Client_->GetContext(),
  118. TransactionId_,
  119. Client_->GetTransactionPinger()->GetChildTxPinger(),
  120. TStartTransactionOptions()))
  121. , ClientRetryPolicy_(Client_->GetRetryPolicy())
  122. , PreparationId_(CreateGuidAsString())
  123. { }
  124. const TClientContext& TOperationPreparer::GetContext() const
  125. {
  126. return Client_->GetContext();
  127. }
  128. TTransactionId TOperationPreparer::GetTransactionId() const
  129. {
  130. return TransactionId_;
  131. }
  132. TClientPtr TOperationPreparer::GetClient() const
  133. {
  134. return Client_;
  135. }
  136. const TString& TOperationPreparer::GetPreparationId() const
  137. {
  138. return PreparationId_;
  139. }
  140. const IClientRetryPolicyPtr& TOperationPreparer::GetClientRetryPolicy() const
  141. {
  142. return ClientRetryPolicy_;
  143. }
  144. TOperationId TOperationPreparer::StartOperation(
  145. TOperation* operation,
  146. EOperationType type,
  147. const TNode& spec)
  148. {
  149. CheckValidity();
  150. auto operationId = RequestWithRetry<TOperationId>(
  151. ::MakeIntrusive<TOperationForwardingRequestRetryPolicy>(
  152. ClientRetryPolicy_->CreatePolicyForStartOperationRequest(),
  153. TOperationPtr(operation)),
  154. [this, &type, &spec] (TMutationId& mutationId) {
  155. return Client_->GetRawClient()->StartOperation(mutationId, TransactionId_, type, spec);
  156. });
  157. YT_LOG_DEBUG("Operation started (OperationId: %v; PreparationId: %v)",
  158. operationId,
  159. GetPreparationId());
  160. YT_LOG_INFO("Operation %v started (%v): %v",
  161. operationId,
  162. type,
  163. GetOperationWebInterfaceUrl(GetContext().ServerName, operationId));
  164. TOperationExecutionTimeTracker::Get()->Start(operationId);
  165. Client_->GetYtPoller().Watch(
  166. new TWaitOperationStartPollerItem(operationId, std::move(FileTransaction_)));
  167. return operationId;
  168. }
  169. void TOperationPreparer::LockFiles(TVector<TRichYPath>* paths)
  170. {
  171. CheckValidity();
  172. TVector<::NThreading::TFuture<TLockId>> lockIdFutures;
  173. lockIdFutures.reserve(paths->size());
  174. auto lockRequest = Client_->GetRawClient()->CreateRawBatchRequest();
  175. for (const auto& path : *paths) {
  176. lockIdFutures.push_back(lockRequest->Lock(
  177. FileTransaction_->GetId(),
  178. path.Path_,
  179. ELockMode::LM_SNAPSHOT,
  180. TLockOptions().Waitable(true)));
  181. }
  182. lockRequest->ExecuteBatch();
  183. TVector<::NThreading::TFuture<TNode>> nodeIdFutures;
  184. nodeIdFutures.reserve(paths->size());
  185. auto getNodeIdRequest = Client_->GetRawClient()->CreateRawBatchRequest();
  186. for (const auto& lockIdFuture : lockIdFutures) {
  187. nodeIdFutures.push_back(getNodeIdRequest->Get(
  188. FileTransaction_->GetId(),
  189. ::TStringBuilder() << '#' << GetGuidAsString(lockIdFuture.GetValue()) << "/@node_id",
  190. TGetOptions()));
  191. }
  192. getNodeIdRequest->ExecuteBatch();
  193. for (size_t i = 0; i != paths->size(); ++i) {
  194. auto& richPath = (*paths)[i];
  195. richPath.OriginalPath(richPath.Path_);
  196. richPath.Path("#" + nodeIdFutures[i].GetValue().AsString());
  197. YT_LOG_DEBUG("Locked file %v, new path is %v",
  198. *richPath.OriginalPath_,
  199. richPath.Path_);
  200. }
  201. }
  202. void TOperationPreparer::CheckValidity() const
  203. {
  204. Y_ENSURE(
  205. FileTransaction_,
  206. "File transaction is already moved, are you trying to use preparer for more than one operation?");
  207. }
  208. ////////////////////////////////////////////////////////////////////////////////
  209. class TRetryPolicyIgnoringLockConflicts
  210. : public TAttemptLimitedRetryPolicy
  211. {
  212. public:
  213. using TAttemptLimitedRetryPolicy::TAttemptLimitedRetryPolicy;
  214. using TAttemptLimitedRetryPolicy::OnGenericError;
  215. TMaybe<TDuration> OnRetriableError(const TErrorResponse& e) override
  216. {
  217. if (IsAttemptLimitExceeded()) {
  218. return Nothing();
  219. }
  220. if (e.IsConcurrentTransactionLockConflict()) {
  221. return GetBackoffDuration(Config_);
  222. }
  223. return TAttemptLimitedRetryPolicy::OnRetriableError(e);
  224. }
  225. };
  226. ////////////////////////////////////////////////////////////////////////////////
  227. class TFileToUpload
  228. : public IItemToUpload
  229. {
  230. public:
  231. TFileToUpload(TString fileName, TMaybe<TString> md5)
  232. : FileName_(std::move(fileName))
  233. , MD5_(std::move(md5))
  234. { }
  235. TString CalculateMD5() const override
  236. {
  237. if (MD5_) {
  238. return *MD5_;
  239. }
  240. constexpr size_t md5Size = 32;
  241. TString result;
  242. result.ReserveAndResize(md5Size);
  243. MD5::File(FileName_.data(), result.Detach());
  244. MD5_ = result;
  245. return result;
  246. }
  247. std::unique_ptr<IInputStream> CreateInputStream() const override
  248. {
  249. return std::make_unique<TFileInput>(FileName_);
  250. }
  251. TString GetDescription() const override
  252. {
  253. return FileName_;
  254. }
  255. i64 GetDataSize() const override
  256. {
  257. return GetFileLength(FileName_);
  258. }
  259. private:
  260. TString FileName_;
  261. mutable TMaybe<TString> MD5_;
  262. };
  263. class TDataToUpload
  264. : public IItemToUpload
  265. {
  266. public:
  267. TDataToUpload(TString data, TString description)
  268. : Data_(std::move(data))
  269. , Description_(std::move(description))
  270. { }
  271. TString CalculateMD5() const override
  272. {
  273. constexpr size_t md5Size = 32;
  274. TString result;
  275. result.ReserveAndResize(md5Size);
  276. MD5::Data(reinterpret_cast<const unsigned char*>(Data_.data()), Data_.size(), result.Detach());
  277. return result;
  278. }
  279. std::unique_ptr<IInputStream> CreateInputStream() const override
  280. {
  281. return std::make_unique<TMemoryInput>(Data_.data(), Data_.size());
  282. }
  283. TString GetDescription() const override
  284. {
  285. return Description_;
  286. }
  287. i64 GetDataSize() const override
  288. {
  289. return std::ssize(Data_);
  290. }
  291. private:
  292. TString Data_;
  293. TString Description_;
  294. };
  295. ////////////////////////////////////////////////////////////////////////////////
  296. static const TString& GetPersistentExecPathMd5()
  297. {
  298. static TString md5 = MD5::File(GetPersistentExecPath());
  299. return md5;
  300. }
  301. static TMaybe<TSmallJobFile> GetJobState(const IJob& job)
  302. {
  303. TString result;
  304. {
  305. TStringOutput output(result);
  306. job.Save(output);
  307. output.Finish();
  308. }
  309. if (result.empty()) {
  310. return Nothing();
  311. } else {
  312. return TSmallJobFile{"jobstate", result};
  313. }
  314. }
  315. ////////////////////////////////////////////////////////////////////////////////
  316. TJobPreparer::TJobPreparer(
  317. TOperationPreparer& operationPreparer,
  318. const TUserJobSpec& spec,
  319. const IJob& job,
  320. size_t outputTableCount,
  321. const TVector<TSmallJobFile>& smallFileList,
  322. const TOperationOptions& options)
  323. : RawClient_(operationPreparer.GetClient()->GetRawClient())
  324. , OperationPreparer_(operationPreparer)
  325. , Spec_(spec)
  326. , Options_(options)
  327. , Layers_(spec.Layers_)
  328. {
  329. CreateStorage();
  330. auto cypressFileList = NRawClient::CanonizeYPaths(RawClient_, spec.Files_);
  331. for (const auto& file : cypressFileList) {
  332. UseFileInCypress(file);
  333. }
  334. for (const auto& localFile : spec.GetLocalFiles()) {
  335. UploadLocalFile(std::get<0>(localFile), std::get<1>(localFile));
  336. }
  337. auto jobStateSmallFile = GetJobState(job);
  338. if (jobStateSmallFile) {
  339. UploadSmallFile(*jobStateSmallFile);
  340. }
  341. for (const auto& smallFile : smallFileList) {
  342. UploadSmallFile(smallFile);
  343. }
  344. if (auto commandJob = dynamic_cast<const ICommandJob*>(&job)) {
  345. IsCommandJob_ = true;
  346. ClassName_ = TJobFactory::Get()->GetJobName(&job);
  347. Command_ = commandJob->GetCommand();
  348. } else {
  349. PrepareJobBinary(job, outputTableCount, jobStateSmallFile.Defined());
  350. }
  351. operationPreparer.LockFiles(&CachedFiles_);
  352. }
  353. TVector<TRichYPath> TJobPreparer::GetFiles() const
  354. {
  355. TVector<TRichYPath> allFiles = CypressFiles_;
  356. allFiles.insert(allFiles.end(), CachedFiles_.begin(), CachedFiles_.end());
  357. return allFiles;
  358. }
  359. TVector<TYPath> TJobPreparer::GetLayers() const
  360. {
  361. return Layers_;
  362. }
  363. const TString& TJobPreparer::GetClassName() const
  364. {
  365. return ClassName_;
  366. }
  367. const TString& TJobPreparer::GetCommand() const
  368. {
  369. return Command_;
  370. }
  371. const TUserJobSpec& TJobPreparer::GetSpec() const
  372. {
  373. return Spec_;
  374. }
  375. bool TJobPreparer::ShouldMountSandbox() const
  376. {
  377. return OperationPreparer_.GetContext().Config->MountSandboxInTmpfs || Options_.MountSandboxInTmpfs_;
  378. }
  379. ui64 TJobPreparer::GetTotalFileSize() const
  380. {
  381. return TotalFileSize_;
  382. }
  383. bool TJobPreparer::ShouldRedirectStdoutToStderr() const
  384. {
  385. return !IsCommandJob_ && OperationPreparer_.GetContext().Config->RedirectStdoutToStderr;
  386. }
  387. TString TJobPreparer::GetFileStorage() const
  388. {
  389. return Options_.FileStorage_ ?
  390. *Options_.FileStorage_ :
  391. OperationPreparer_.GetContext().Config->RemoteTempFilesDirectory;
  392. }
  393. TYPath TJobPreparer::GetCachePath() const
  394. {
  395. return AddPathPrefix(
  396. ::TStringBuilder() << GetFileStorage() << "/new_cache",
  397. OperationPreparer_.GetContext().Config->Prefix);
  398. }
  399. void TJobPreparer::CreateStorage() const
  400. {
  401. RequestWithRetry<void>(
  402. OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(),
  403. [this] (TMutationId& mutationId) {
  404. RawClient_->Create(
  405. mutationId,
  406. Options_.FileStorageTransactionId_,
  407. GetCachePath(),
  408. NT_MAP,
  409. TCreateOptions()
  410. .IgnoreExisting(true)
  411. .Recursive(true));
  412. });
  413. }
  414. int TJobPreparer::GetFileCacheReplicationFactor() const
  415. {
  416. if (IsLocalMode()) {
  417. return 1;
  418. } else {
  419. return OperationPreparer_.GetContext().Config->FileCacheReplicationFactor;
  420. }
  421. }
  422. void TJobPreparer::CreateFileInCypress(const TString& path) const
  423. {
  424. auto attributes = TNode()("replication_factor", GetFileCacheReplicationFactor());
  425. if (Options_.FileExpirationTimeout_) {
  426. attributes["expiration_timeout"] = Options_.FileExpirationTimeout_->MilliSeconds();
  427. }
  428. RequestWithRetry<void>(
  429. OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(),
  430. [this, &path, &attributes] (TMutationId& mutationId) {
  431. RawClient_->Create(
  432. mutationId,
  433. Options_.FileStorageTransactionId_,
  434. path,
  435. NT_FILE,
  436. TCreateOptions()
  437. .IgnoreExisting(true)
  438. .Recursive(true)
  439. .Attributes(attributes));
  440. });
  441. }
  442. TString TJobPreparer::PutFileToCypressCache(
  443. const TString& path,
  444. const TString& md5Signature,
  445. TTransactionId transactionId) const
  446. {
  447. constexpr ui32 LockConflictRetryCount = 30;
  448. auto retryPolicy = MakeIntrusive<TRetryPolicyIgnoringLockConflicts>(
  449. LockConflictRetryCount,
  450. OperationPreparer_.GetContext().Config);
  451. auto options = TPutFileToCacheOptions();
  452. if (Options_.FileExpirationTimeout_) {
  453. options.PreserveExpirationTimeout(true);
  454. }
  455. auto cachePath = RequestWithRetry<TYPath>(
  456. retryPolicy,
  457. [this, &path, &md5Signature, &transactionId, &options] (TMutationId /*mutationId*/) {
  458. return RawClient_->PutFileToCache(transactionId, path, md5Signature, GetCachePath(), options);
  459. });
  460. RequestWithRetry<void>(
  461. OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(),
  462. [this, &transactionId, &path] (TMutationId& mutationId) {
  463. RawClient_->Remove(mutationId, transactionId, path, TRemoveOptions().Force(true));
  464. });
  465. return cachePath;
  466. }
  467. TMaybe<TString> TJobPreparer::GetItemFromCypressCache(const TString& md5Signature, const TString& fileName) const
  468. {
  469. constexpr ui32 LockConflictRetryCount = 30;
  470. auto retryPolicy = MakeIntrusive<TRetryPolicyIgnoringLockConflicts>(
  471. LockConflictRetryCount,
  472. OperationPreparer_.GetContext().Config);
  473. auto maybePath = RequestWithRetry<TMaybe<TYPath>>(
  474. retryPolicy,
  475. [this, &md5Signature] (TMutationId /*mutationId*/) {
  476. return RawClient_->GetFileFromCache(TTransactionId(), md5Signature, GetCachePath());
  477. });
  478. if (maybePath) {
  479. YT_LOG_DEBUG(
  480. "File is already in cache (FileName: %v, FilePath: %v)",
  481. fileName,
  482. *maybePath);
  483. }
  484. return maybePath;
  485. }
  486. TDuration TJobPreparer::GetWaitForUploadTimeout(const IItemToUpload& itemToUpload) const
  487. {
  488. TDuration extraTime = OperationPreparer_.GetContext().Config->WaitLockPollInterval +
  489. TDuration::MilliSeconds(100);
  490. auto dataSizeGb = (itemToUpload.GetDataSize() + 1_GB - 1) / 1_GB;
  491. dataSizeGb = Max<ui64>(dataSizeGb, 1);
  492. return extraTime + dataSizeGb * OperationPreparer_.GetContext().Config->CacheLockTimeoutPerGb;
  493. }
  494. TString TJobPreparer::UploadToRandomPath(const IItemToUpload& itemToUpload) const
  495. {
  496. TString uniquePath = AddPathPrefix(
  497. ::TStringBuilder() << GetFileStorage() << "/cpp_" << CreateGuidAsString(),
  498. OperationPreparer_.GetContext().Config->Prefix);
  499. YT_LOG_INFO("Uploading file to random cypress path (FileName: %v; CypressPath: %v; PreparationId: %v)",
  500. itemToUpload.GetDescription(),
  501. uniquePath,
  502. OperationPreparer_.GetPreparationId());
  503. CreateFileInCypress(uniquePath);
  504. {
  505. TFileWriter writer(
  506. uniquePath,
  507. OperationPreparer_.GetClient()->GetRawClient(),
  508. OperationPreparer_.GetClientRetryPolicy(),
  509. OperationPreparer_.GetClient()->GetTransactionPinger(),
  510. OperationPreparer_.GetContext(),
  511. Options_.FileStorageTransactionId_,
  512. TFileWriterOptions().ComputeMD5(true));
  513. itemToUpload.CreateInputStream()->ReadAll(writer);
  514. writer.Finish();
  515. }
  516. return uniquePath;
  517. }
  518. TMaybe<TString> TJobPreparer::TryUploadWithDeduplication(const IItemToUpload& itemToUpload) const
  519. {
  520. const auto md5Signature = itemToUpload.CalculateMD5();
  521. auto fileName = ::TStringBuilder() << GetFileStorage() << "/cpp_md5_" << md5Signature;
  522. if (OperationPreparer_.GetContext().Config->CacheUploadDeduplicationMode == EUploadDeduplicationMode::Host) {
  523. fileName << "_" << MD5::Data(TProcessState::Get()->FqdnHostName);
  524. }
  525. TString cypressPath = AddPathPrefix(fileName, OperationPreparer_.GetContext().Config->Prefix);
  526. CreateFileInCypress(cypressPath);
  527. auto uploadTx = MakeIntrusive<TTransaction>(
  528. OperationPreparer_.GetClient()->GetRawClient(),
  529. OperationPreparer_.GetClient(),
  530. OperationPreparer_.GetContext(),
  531. TTransactionId(),
  532. TStartTransactionOptions());
  533. ILockPtr lock;
  534. try {
  535. lock = uploadTx->Lock(cypressPath, ELockMode::LM_EXCLUSIVE, TLockOptions().Waitable(true));
  536. } catch (const TErrorResponse& e) {
  537. if (e.IsResolveError()) {
  538. // If the node doesn't exist, it must be removed by concurrent uploading process.
  539. // Let's try to find it in the cache.
  540. return GetItemFromCypressCache(md5Signature, itemToUpload.GetDescription());
  541. }
  542. throw;
  543. }
  544. auto waitTimeout = GetWaitForUploadTimeout(itemToUpload);
  545. YT_LOG_DEBUG("Waiting for the lock on file (FileName: %v; CypressPath: %v; LockTimeout: %v)",
  546. itemToUpload.GetDescription(),
  547. cypressPath,
  548. waitTimeout);
  549. if (!TWaitProxy::Get()->WaitFuture(lock->GetAcquiredFuture(), waitTimeout)) {
  550. YT_LOG_DEBUG("Waiting for the lock timed out. Fallback to random path uploading (FileName: %v; CypressPath: %v)",
  551. itemToUpload.GetDescription(),
  552. cypressPath);
  553. return Nothing();
  554. }
  555. YT_LOG_DEBUG("Exclusive lock successfully acquired (FileName: %v; CypressPath: %v)",
  556. itemToUpload.GetDescription(),
  557. cypressPath);
  558. // Ensure that this process is the first to take a lock.
  559. if (auto cachedItemPath = GetItemFromCypressCache(md5Signature, itemToUpload.GetDescription())) {
  560. return *cachedItemPath;
  561. }
  562. YT_LOG_INFO("Uploading file to cypress (FileName: %v; CypressPath: %v; PreparationId: %v)",
  563. itemToUpload.GetDescription(),
  564. cypressPath,
  565. OperationPreparer_.GetPreparationId());
  566. {
  567. auto writer = uploadTx->CreateFileWriter(cypressPath, TFileWriterOptions().ComputeMD5(true));
  568. YT_VERIFY(writer);
  569. itemToUpload.CreateInputStream()->ReadAll(*writer);
  570. writer->Finish();
  571. }
  572. auto path = PutFileToCypressCache(cypressPath, md5Signature, uploadTx->GetId());
  573. uploadTx->Commit();
  574. return path;
  575. }
  576. TString TJobPreparer::UploadToCacheUsingApi(const IItemToUpload& itemToUpload) const
  577. {
  578. auto md5Signature = itemToUpload.CalculateMD5();
  579. Y_ABORT_UNLESS(md5Signature.size() == 32);
  580. if (auto cachedItemPath = GetItemFromCypressCache(md5Signature, itemToUpload.GetDescription())) {
  581. return *cachedItemPath;
  582. }
  583. YT_LOG_INFO("File not found in cache; uploading to cypress (FileName: %v; PreparationId: %v)",
  584. itemToUpload.GetDescription(),
  585. OperationPreparer_.GetPreparationId());
  586. const auto& config = OperationPreparer_.GetContext().Config;
  587. if (config->CacheUploadDeduplicationMode != EUploadDeduplicationMode::Disabled &&
  588. itemToUpload.GetDataSize() > config->CacheUploadDeduplicationThreshold) {
  589. if (auto path = TryUploadWithDeduplication(itemToUpload)) {
  590. return *path;
  591. }
  592. }
  593. auto path = UploadToRandomPath(itemToUpload);
  594. return PutFileToCypressCache(path, md5Signature, Options_.FileStorageTransactionId_);
  595. }
  596. TString TJobPreparer::UploadToCache(const IItemToUpload& itemToUpload) const
  597. {
  598. YT_LOG_INFO("Uploading file (FileName: %v; PreparationId: %v)",
  599. itemToUpload.GetDescription(),
  600. OperationPreparer_.GetPreparationId());
  601. TString result;
  602. switch (Options_.FileCacheMode_) {
  603. case TOperationOptions::EFileCacheMode::ApiCommandBased:
  604. Y_ENSURE_EX(Options_.FileStorageTransactionId_.IsEmpty(), TApiUsageError() <<
  605. "Default cache mode (API command-based) doesn't allow non-default 'FileStorageTransactionId_'");
  606. result = UploadToCacheUsingApi(itemToUpload);
  607. break;
  608. case TOperationOptions::EFileCacheMode::CachelessRandomPathUpload:
  609. result = UploadToRandomPath(itemToUpload);
  610. break;
  611. default:
  612. Y_ABORT("Unknown file cache mode: %d", static_cast<int>(Options_.FileCacheMode_));
  613. }
  614. YT_LOG_INFO("Complete uploading file (FileName: %v; PreparationId: %v)",
  615. itemToUpload.GetDescription(),
  616. OperationPreparer_.GetPreparationId());
  617. return result;
  618. }
  619. void TJobPreparer::UseFileInCypress(const TRichYPath& file)
  620. {
  621. auto exists = RequestWithRetry<bool>(
  622. OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(),
  623. [this, &file] (TMutationId /*mutationId*/) {
  624. return RawClient_->Exists(
  625. file.TransactionId_.GetOrElse(OperationPreparer_.GetTransactionId()),
  626. file.Path_);
  627. });
  628. if (!exists)
  629. {
  630. ythrow yexception() << "File " << file.Path_ << " does not exist";
  631. }
  632. if (ShouldMountSandbox()) {
  633. auto size = RequestWithRetry<i64>(
  634. OperationPreparer_.GetClientRetryPolicy()->CreatePolicyForGenericRequest(),
  635. [this, &file] (TMutationId /*mutationId*/) {
  636. return RawClient_->Get(
  637. file.TransactionId_.GetOrElse(OperationPreparer_.GetTransactionId()),
  638. file.Path_ + "/@uncompressed_data_size")
  639. .AsInt64();
  640. });
  641. TotalFileSize_ += RoundUpFileSize(static_cast<ui64>(size));
  642. }
  643. CypressFiles_.push_back(file);
  644. }
  645. void TJobPreparer::UploadLocalFile(
  646. const TLocalFilePath& localPath,
  647. const TAddLocalFileOptions& options,
  648. bool isApiFile)
  649. {
  650. TFsPath fsPath(localPath);
  651. fsPath.CheckExists();
  652. TFileStat stat;
  653. fsPath.Stat(stat);
  654. bool isExecutable = stat.Mode & (S_IXUSR | S_IXGRP | S_IXOTH);
  655. auto cachePath = UploadToCache(TFileToUpload(localPath, options.MD5CheckSum_));
  656. TRichYPath cypressPath;
  657. if (isApiFile) {
  658. cypressPath = OperationPreparer_.GetContext().Config->ApiFilePathOptions;
  659. }
  660. cypressPath.Path(cachePath).FileName(options.PathInJob_.GetOrElse(fsPath.Basename()));
  661. if (isExecutable) {
  662. cypressPath.Executable(true);
  663. }
  664. if (options.BypassArtifactCache_) {
  665. cypressPath.BypassArtifactCache(*options.BypassArtifactCache_);
  666. }
  667. if (ShouldMountSandbox()) {
  668. TotalFileSize_ += RoundUpFileSize(stat.Size);
  669. }
  670. CachedFiles_.push_back(cypressPath);
  671. }
  672. void TJobPreparer::UploadBinary(const TJobBinaryConfig& jobBinary)
  673. {
  674. if (std::holds_alternative<TJobBinaryLocalPath>(jobBinary)) {
  675. auto binaryLocalPath = std::get<TJobBinaryLocalPath>(jobBinary);
  676. auto opts = TAddLocalFileOptions().PathInJob("cppbinary");
  677. if (binaryLocalPath.MD5CheckSum) {
  678. opts.MD5CheckSum(*binaryLocalPath.MD5CheckSum);
  679. }
  680. UploadLocalFile(binaryLocalPath.Path, opts, /* isApiFile */ true);
  681. } else if (std::holds_alternative<TJobBinaryCypressPath>(jobBinary)) {
  682. auto binaryCypressPath = std::get<TJobBinaryCypressPath>(jobBinary);
  683. TRichYPath ytPath = OperationPreparer_.GetContext().Config->ApiFilePathOptions;
  684. ytPath.Path(binaryCypressPath.Path);
  685. if (binaryCypressPath.TransactionId) {
  686. ytPath.TransactionId(*binaryCypressPath.TransactionId);
  687. }
  688. UseFileInCypress(ytPath.FileName("cppbinary").Executable(true));
  689. } else {
  690. Y_ABORT("%s", (::TStringBuilder() << "Unexpected jobBinary tag: " << jobBinary.index()).data());
  691. }
  692. }
  693. void TJobPreparer::UploadSmallFile(const TSmallJobFile& smallFile)
  694. {
  695. auto cachePath = UploadToCache(TDataToUpload(smallFile.Data, smallFile.FileName + " [generated-file]"));
  696. auto path = OperationPreparer_.GetContext().Config->ApiFilePathOptions;
  697. CachedFiles_.push_back(path.Path(cachePath).FileName(smallFile.FileName));
  698. if (ShouldMountSandbox()) {
  699. TotalFileSize_ += RoundUpFileSize(smallFile.Data.size());
  700. }
  701. }
  702. bool TJobPreparer::IsLocalMode() const
  703. {
  704. return UseLocalModeOptimization(
  705. OperationPreparer_.GetClient()->GetRawClient(),
  706. OperationPreparer_.GetContext(),
  707. OperationPreparer_.GetClientRetryPolicy());
  708. }
  709. void TJobPreparer::PrepareJobBinary(const IJob& job, int outputTableCount, bool hasState)
  710. {
  711. auto jobBinary = TJobBinaryConfig();
  712. if (!std::holds_alternative<TJobBinaryDefault>(Spec_.GetJobBinary())) {
  713. jobBinary = Spec_.GetJobBinary();
  714. }
  715. TString binaryPathInsideJob;
  716. if (std::holds_alternative<TJobBinaryDefault>(jobBinary)) {
  717. if (GetInitStatus() != EInitStatus::FullInitialization) {
  718. ythrow yexception() << "NYT::Initialize() must be called prior to any operation";
  719. }
  720. const bool isLocalMode = IsLocalMode();
  721. const TMaybe<TString> md5 = !isLocalMode ? MakeMaybe(GetPersistentExecPathMd5()) : Nothing();
  722. jobBinary = TJobBinaryLocalPath{GetPersistentExecPath(), md5};
  723. if (isLocalMode) {
  724. binaryPathInsideJob = GetExecPath();
  725. }
  726. } else if (std::holds_alternative<TJobBinaryLocalPath>(jobBinary)) {
  727. const bool isLocalMode = IsLocalMode();
  728. if (isLocalMode) {
  729. binaryPathInsideJob = TFsPath(std::get<TJobBinaryLocalPath>(jobBinary).Path).RealPath();
  730. }
  731. }
  732. Y_ASSERT(!std::holds_alternative<TJobBinaryDefault>(jobBinary));
  733. // binaryPathInsideJob is only set when LocalModeOptimization option is on, so upload is not needed
  734. if (!binaryPathInsideJob) {
  735. binaryPathInsideJob = "./cppbinary";
  736. UploadBinary(jobBinary);
  737. }
  738. TString jobCommandPrefix = Options_.JobCommandPrefix_;
  739. if (!Spec_.JobCommandPrefix_.empty()) {
  740. jobCommandPrefix = Spec_.JobCommandPrefix_;
  741. }
  742. TString jobCommandSuffix = Options_.JobCommandSuffix_;
  743. if (!Spec_.JobCommandSuffix_.empty()) {
  744. jobCommandSuffix = Spec_.JobCommandSuffix_;
  745. }
  746. ClassName_ = TJobFactory::Get()->GetJobName(&job);
  747. auto jobArguments = TNode::CreateMap();
  748. jobArguments["job_name"] = ClassName_;
  749. jobArguments["output_table_count"] = static_cast<i64>(outputTableCount);
  750. jobArguments["has_state"] = hasState;
  751. Spec_.AddEnvironment("YT_JOB_ARGUMENTS", NodeToYsonString(jobArguments));
  752. Command_ = ::TStringBuilder() <<
  753. jobCommandPrefix <<
  754. (OperationPreparer_.GetContext().Config->UseClientProtobuf ? "YT_USE_CLIENT_PROTOBUF=1" : "YT_USE_CLIENT_PROTOBUF=0") << " " <<
  755. binaryPathInsideJob <<
  756. jobCommandSuffix;
  757. }
  758. ////////////////////////////////////////////////////////////////////////////////
  759. } // namespace NYT::NDetail