// file_udf.cpp (19 KB)
  1. #include <yql/essentials/public/udf/udf_helpers.h>
  2. #include <yql/essentials/utils/line_split.h>
  3. #include <util/generic/yexception.h>
  4. #include <util/stream/buffered.h>
  5. #include <util/stream/file.h>
  6. #include <util/string/cast.h>
  7. #include <util/ysaveload.h>
  8. #include <functional>
  9. using namespace NKikimr;
  10. using namespace NUdf;
  // Name of the line-by-line reader function. The separate `extern` declaration
  // gives the const array external linkage; the array is later passed as a
  // template argument to TUserDataTypeFuncFactory.
  extern const char ByLineFuncName[];
  const char ByLineFuncName[] = "ByLines";
  13. namespace {
  14. namespace Helper {
  15. template <class TUserType>
  16. inline bool ConvertToUnboxed(const IValueBuilder& valueBuilder, const TString& curLine, TUnboxedValue& result) {
  17. Y_UNUSED(valueBuilder);
  18. TUserType userType;
  19. if (!TryFromString<TUserType>(curLine, userType)) {
  20. return false;
  21. }
  22. result = TUnboxedValuePod(userType);
  23. return true;
  24. }
  25. template <>
  26. inline bool ConvertToUnboxed<const char*>(const IValueBuilder& valueBuilder, const TString& curLine, TUnboxedValue& result) {
  27. result = valueBuilder.NewString(curLine);
  28. return true;
  29. }
  30. template <>
  31. inline bool ConvertToUnboxed<TUtf8>(const IValueBuilder& valueBuilder, const TString& curLine, TUnboxedValue& result) {
  32. result = valueBuilder.NewString(curLine);
  33. return true;
  34. }
  35. template <>
  36. inline bool ConvertToUnboxed<TYson>(const IValueBuilder& valueBuilder, const TString& curLine, TUnboxedValue& result) {
  37. result = valueBuilder.NewString(curLine);
  38. return true;
  39. }
  40. template <>
  41. inline bool ConvertToUnboxed<TJson>(const IValueBuilder& valueBuilder, const TString& curLine, TUnboxedValue& result) {
  42. result = valueBuilder.NewString(curLine);
  43. return true;
  44. }
  /// Maps a C++ type to the type name used when building error messages.
  ///
  /// This primary template is the fallback for any type without an explicit
  /// specialization. `Name()` is `constexpr`, like in every specialization, so
  /// the trait can be used in constant expressions for any type.
  template <typename T>
  struct TypeToTypeName {
      /// @return the literal "Unknown".
      static constexpr const char* Name() {
          return "Unknown";
      }
  };
  51. template <>
  52. struct TypeToTypeName<bool> {
  53. static constexpr const char* Name() {
  54. return "Bool";
  55. }
  56. };
  57. template <>
  58. struct TypeToTypeName<i8> {
  59. static constexpr const char* Name() {
  60. return "Int8";
  61. }
  62. };
  63. template <>
  64. struct TypeToTypeName<ui8> {
  65. static constexpr const char* Name() {
  66. return "Uint8";
  67. }
  68. };
  69. template <>
  70. struct TypeToTypeName<i16> {
  71. static constexpr const char* Name() {
  72. return "Int16";
  73. }
  74. };
  75. template <>
  76. struct TypeToTypeName<ui16> {
  77. static constexpr const char* Name() {
  78. return "Uint16";
  79. }
  80. };
  81. template <>
  82. struct TypeToTypeName<ui32> {
  83. static constexpr const char* Name() {
  84. return "Uint32";
  85. }
  86. };
  87. template <>
  88. struct TypeToTypeName<ui64> {
  89. static constexpr const char* Name() {
  90. return "Uint64";
  91. }
  92. };
  93. template <>
  94. struct TypeToTypeName<i32> {
  95. static constexpr const char* Name() {
  96. return "Int32";
  97. }
  98. };
  99. template <>
  100. struct TypeToTypeName<i64> {
  101. static constexpr const char* Name() {
  102. return "Int64";
  103. }
  104. };
  105. template <>
  106. struct TypeToTypeName<float> {
  107. static constexpr const char* Name() {
  108. return "Float";
  109. }
  110. };
  111. template <>
  112. struct TypeToTypeName<double> {
  113. static constexpr const char* Name() {
  114. return "Double";
  115. }
  116. };
  117. template <>
  118. struct TypeToTypeName<const char*> {
  119. static constexpr const char* Name() {
  120. return "String";
  121. }
  122. };
  123. template <>
  124. struct TypeToTypeName<TUtf8> {
  125. static constexpr const char* Name() {
  126. return "Utf8";
  127. }
  128. };
  129. template <>
  130. struct TypeToTypeName<TYson> {
  131. static constexpr const char* Name() {
  132. return "Yson";
  133. }
  134. };
  135. template <>
  136. struct TypeToTypeName<TJson> {
  137. static constexpr const char* Name() {
  138. return "Json";
  139. }
  140. };
  141. }
  142. static const ui64 TAKE_UNLIM = -1;
  143. bool SkipElements(IBoxedValue& iter, ui64 skip) {
  144. for (; skip > 0; --skip) {
  145. if (!TBoxedValueAccessor::Skip(iter)) {
  146. return false;
  147. }
  148. }
  149. return true;
  150. }
  151. typedef std::function<void(const TString& message)> TTerminateFunc;
  152. class TStreamMeta: public TThrRefBase {
  153. public:
  154. typedef TBuffered<TUnbufferedFileInput> TStream;
  155. typedef TIntrusivePtr<TStreamMeta> TPtr;
  156. TStreamMeta(TString filePath)
  157. : FilePath(filePath)
  158. {
  159. // work in greedy mode to catch error on creation
  160. Cached = DoCreateStream();
  161. }
  162. std::unique_ptr<TStream> CreateStream(TTerminateFunc terminateFunc) {
  163. if (Cached) {
  164. return std::move(Cached);
  165. }
  166. terminateFunc("The file iterator was already created. To scan file data multiple times please use ListCollect either over ParseFile or over some lazy function over it, e.g. ListMap");
  167. Y_ABORT("Terminate unstoppable!");
  168. }
  169. bool GetLinesCount(ui64& count) const {
  170. if (LinesCount == Unknown)
  171. return false;
  172. count = LinesCount;
  173. return true;
  174. }
  175. void SetLinesCount(ui64 count) {
  176. Y_DEBUG_ABORT_UNLESS(LinesCount == Unknown || count == LinesCount, "Set another value of count lines");
  177. if (LinesCount == Unknown) {
  178. LinesCount = count;
  179. }
  180. }
  181. const TString& GetFilePath() const {
  182. return FilePath;
  183. }
  184. private:
  185. std::unique_ptr<TStream> DoCreateStream() {
  186. static const auto bufferSize = 1 << 12;
  187. TFile file(FilePath, OpenExisting | RdOnly | Seq);
  188. if (FileSize == Unknown) {
  189. FileSize = file.GetLength();
  190. }
  191. return std::make_unique<TBuffered<TUnbufferedFileInput>>(bufferSize, file);
  192. }
  193. TString FilePath;
  194. static const ui64 Unknown = -1;
  195. ui64 FileSize = Unknown;
  196. ui64 LinesCount = Unknown;
  197. std::unique_ptr<TStream> Cached;
  198. };
  199. class TEmptyIter: public TBoxedValue {
  200. private:
  201. bool Skip() override {
  202. return false;
  203. }
  204. bool Next(TUnboxedValue&) override {
  205. return false;
  206. }
  207. public:
  208. TEmptyIter(TTerminateFunc terminateFunc)
  209. : TerminateFunc(terminateFunc)
  210. {
  211. }
  212. private:
  213. const TTerminateFunc TerminateFunc;
  214. };
  215. template <class TUserType>
  216. class TLineByLineBoxedValueIterator: public TBoxedValue {
  217. public:
  218. TLineByLineBoxedValueIterator(TStreamMeta::TPtr metaPtr, std::unique_ptr<TStreamMeta::TStream>&& stream, const IValueBuilder& valueBuilder, TTerminateFunc terminateFunc)
  219. : MetaPtr(metaPtr)
  220. , ValueBuilder(valueBuilder)
  221. , Stream(std::move(stream))
  222. , Splitter(*Stream)
  223. , TerminateFunc(terminateFunc)
  224. {
  225. }
  226. void SetLimit(ui64 limit = TAKE_UNLIM) {
  227. Limit = limit;
  228. }
  229. private:
  230. bool SkipLimit() {
  231. if (Limit != TAKE_UNLIM) {
  232. if (Limit == 0) {
  233. return false;
  234. }
  235. --Limit;
  236. }
  237. return true;
  238. }
  239. bool Skip() final {
  240. ++CurLineNum;
  241. return Splitter.Next(CurLine) && SkipLimit();
  242. }
  243. bool Next(TUnboxedValue& value) override {
  244. if (!Skip()) {
  245. return false;
  246. }
  247. if (!Helper::ConvertToUnboxed<TUserType>(ValueBuilder, CurLine, value)) {
  248. TStringBuilder sb;
  249. sb << "File::ByLines failed to cast string '" << CurLine << "' to " << Helper::TypeToTypeName<TUserType>::Name() << Endl;
  250. sb << "- path: " << MetaPtr->GetFilePath() << Endl;
  251. sb << "- line: " << CurLineNum << Endl;
  252. TerminateFunc(sb);
  253. Y_ABORT("Terminate unstoppable!");
  254. }
  255. return true;
  256. }
  257. TStreamMeta::TPtr MetaPtr;
  258. const IValueBuilder& ValueBuilder;
  259. std::unique_ptr<TStreamMeta::TStream> Stream;
  260. TLineSplitter Splitter;
  261. TTerminateFunc TerminateFunc;
  262. TString CurLine;
  263. ui64 CurLineNum = 0;
  264. ui64 Limit = TAKE_UNLIM;
  265. TUnboxedValue Result;
  266. };
  267. template <class TUserType>
  268. class TListByLineBoxedValue: public TBoxedValue {
  269. public:
  270. TListByLineBoxedValue(TStreamMeta::TPtr metaPtr, const IValueBuilder& valueBuilder, TTerminateFunc terminateFunc, ui64 skip = 0ULL, ui64 take = TAKE_UNLIM)
  271. : MetaPtr(metaPtr)
  272. , ValueBuilder(valueBuilder)
  273. , TerminateFunc(terminateFunc)
  274. , Skip(skip)
  275. , Take(take)
  276. {}
  277. private:
  278. bool HasFastListLength() const override {
  279. ui64 tmp;
  280. return MetaPtr->GetLinesCount(tmp);
  281. }
  282. ui64 GetListLength() const override {
  283. ui64 length;
  284. if (!MetaPtr->GetLinesCount(length)) {
  285. length = Skip;
  286. for (const auto iter = GetListIterator(); iter.Skip(); ++length)
  287. continue;
  288. if (Take == TAKE_UNLIM) {
  289. MetaPtr->SetLinesCount(length);
  290. }
  291. }
  292. if (length <= Skip) {
  293. return 0;
  294. }
  295. return Min(length - Skip, Take);
  296. }
  297. ui64 GetEstimatedListLength() const override {
  298. /// \todo some optimisation?
  299. return GetListLength();
  300. }
  301. TUnboxedValue GetListIterator() const override {
  302. try {
  303. auto stream = MetaPtr->CreateStream(TerminateFunc);
  304. IBoxedValuePtr iter(new TLineByLineBoxedValueIterator<TUserType>(MetaPtr, std::move(stream), ValueBuilder, TerminateFunc));
  305. if (!Take || !SkipElements(*iter, Skip)) {
  306. return TUnboxedValuePod(new TEmptyIter(TerminateFunc));
  307. }
  308. static_cast<TLineByLineBoxedValueIterator<TUserType>*>(iter.Get())->SetLimit(Take);
  309. return TUnboxedValuePod(std::move(iter));
  310. } catch (const std::exception& e) {
  311. TerminateFunc(CurrentExceptionMessage());
  312. Y_ABORT("Terminate unstoppable!");
  313. }
  314. }
  315. IBoxedValuePtr SkipListImpl(const IValueBuilder& builder, ui64 count) const override {
  316. return new TListByLineBoxedValue(MetaPtr, builder, TerminateFunc, Skip + count, Take == TAKE_UNLIM ? TAKE_UNLIM : Take - std::min(Take, count));
  317. }
  318. IBoxedValuePtr TakeListImpl(const IValueBuilder& builder, ui64 count) const override {
  319. return new TListByLineBoxedValue(MetaPtr, builder, TerminateFunc, Skip, std::min(Take, count));
  320. }
  321. bool HasListItems() const override {
  322. return true;
  323. }
  324. TStreamMeta::TPtr MetaPtr;
  325. const IValueBuilder& ValueBuilder;
  326. TTerminateFunc TerminateFunc;
  327. ui64 Skip = 0ULL;
  328. ui64 Take = TAKE_UNLIM;
  329. };
  330. template <class TUserType>
  331. class TByLinesFunc: public TBoxedValue {
  332. private:
  333. TSourcePosition Pos_;
  334. TByLinesFunc(TSourcePosition pos)
  335. : Pos_(pos)
  336. {}
  337. TUnboxedValue Run(const IValueBuilder* valueBuilder, const TUnboxedValuePod* args) const override {
  338. try {
  339. TString filePath(args[0].AsStringRef());
  340. TStreamMeta::TPtr metaPtr(new TStreamMeta(filePath));
  341. auto pos = Pos_;
  342. auto terminateFunc = [pos](const TString& message) {
  343. UdfTerminate((TStringBuilder() << pos << " " << message).data());
  344. };
  345. return TUnboxedValuePod(new TListByLineBoxedValue<TUserType>(metaPtr, *valueBuilder, terminateFunc));
  346. } catch (const std::exception& e) {
  347. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  348. }
  349. }
  350. public:
  351. static void DeclareSignature(
  352. TStringRef name,
  353. TType* userType,
  354. IFunctionTypeInfoBuilder& builder,
  355. bool typesOnly)
  356. {
  357. Y_UNUSED(name);
  358. builder.UserType(userType);
  359. builder.SimpleSignature<TListType<TUserType>(char*)>();
  360. if (!typesOnly) {
  361. builder.Implementation(new TByLinesFunc<TUserType>(builder.GetSourcePosition()));
  362. }
  363. }
  364. };
  365. class TFolderListFromFile: public TBoxedValue {
  366. private:
  367. class TIterator : public TBoxedValue {
  368. public:
  369. TIterator(ui32 indexP, ui32 indexT, ui32 indexA, const IValueBuilder& valueBuilder, const TSourcePosition& pos, TString filePath)
  370. : IndexP_(indexP)
  371. , IndexT_(indexT)
  372. , IndexA_(indexA)
  373. , ValueBuilder_(valueBuilder)
  374. , Pos_(pos)
  375. , Input_(filePath)
  376. {
  377. }
  378. private:
  379. bool Next(NUdf::TUnboxedValue& value) override {
  380. try {
  381. TString type;
  382. TString path;
  383. TString attrs;
  384. ::Load(&Input_, type);
  385. if (!type) {
  386. return false;
  387. }
  388. ::Load(&Input_, path);
  389. ::Load(&Input_, attrs);
  390. NUdf::TUnboxedValue* items = nullptr;
  391. value = ValueBuilder_.NewArray(3, items);
  392. items[IndexT_] = ValueBuilder_.NewString(type);
  393. items[IndexP_] = ValueBuilder_.NewString(path);
  394. items[IndexA_] = ValueBuilder_.NewString(attrs);
  395. }
  396. catch (const std::exception& e) {
  397. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  398. }
  399. return true;
  400. }
  401. private:
  402. const ui32 IndexP_;
  403. const ui32 IndexT_;
  404. const ui32 IndexA_;
  405. const IValueBuilder& ValueBuilder_;
  406. const TSourcePosition Pos_;
  407. TIFStream Input_;
  408. };
  409. class TList: public TBoxedValue {
  410. public:
  411. TList(ui32 indexP, ui32 indexT, ui32 indexA, const IValueBuilder& valueBuilder, const TSourcePosition& pos, TString filePath)
  412. : IndexP_(indexP)
  413. , IndexT_(indexT)
  414. , IndexA_(indexA)
  415. , ValueBuilder_(valueBuilder)
  416. , Pos_(pos)
  417. , FilePath_(std::move(filePath))
  418. {
  419. }
  420. protected:
  421. NUdf::TUnboxedValue GetListIterator() const override {
  422. return NUdf::TUnboxedValuePod(new TIterator(IndexP_, IndexT_, IndexA_, ValueBuilder_, Pos_, FilePath_));
  423. }
  424. bool HasFastListLength() const override {
  425. return bool(Length);
  426. }
  427. ui64 GetListLength() const override {
  428. if (!Length) {
  429. ui64 length = 0ULL;
  430. for (const auto it = GetListIterator(); it.Skip();) {
  431. ++length;
  432. }
  433. Length = length;
  434. }
  435. return *Length;
  436. }
  437. ui64 GetEstimatedListLength() const override {
  438. return GetListLength();
  439. }
  440. bool HasListItems() const override {
  441. if (HasItems) {
  442. return *HasItems;
  443. }
  444. if (Length) {
  445. HasItems = (*Length != 0);
  446. return *HasItems;
  447. }
  448. auto iter = GetListIterator();
  449. HasItems = iter.Skip();
  450. return *HasItems;
  451. }
  452. protected:
  453. const ui32 IndexP_;
  454. const ui32 IndexT_;
  455. const ui32 IndexA_;
  456. const IValueBuilder& ValueBuilder_;
  457. const TSourcePosition Pos_;
  458. const TString FilePath_;
  459. mutable TMaybe<ui64> Length;
  460. mutable TMaybe<bool> HasItems;
  461. };
  462. public:
  463. TFolderListFromFile(ui32 indexP, ui32 indexT, ui32 indexA, const TSourcePosition& pos)
  464. : IndexP_(indexP)
  465. , IndexT_(indexT)
  466. , IndexA_(indexA)
  467. , Pos_(pos)
  468. {
  469. }
  470. static const ::NYql::NUdf::TStringRef& Name() {
  471. static auto name = ::NYql::NUdf::TStringRef::Of("FolderListFromFile");
  472. return name;
  473. }
  474. TUnboxedValue Run(const IValueBuilder* valueBuilder, const TUnboxedValuePod* args) const override {
  475. try {
  476. TString filePath(args[0].AsStringRef());
  477. return TUnboxedValuePod(new TList(IndexP_, IndexT_, IndexA_, *valueBuilder, Pos_, filePath));
  478. } catch (const std::exception& e) {
  479. UdfTerminate((TStringBuilder() << Pos_ << " " << e.what()).data());
  480. }
  481. }
  482. static bool DeclareSignature(const TStringRef& name, TType* userType, IFunctionTypeInfoBuilder& builder, bool typesOnly) {
  483. if (Name() != name) {
  484. // the only case when we return false
  485. return false;
  486. }
  487. builder.UserType(userType);
  488. ui32 indexP, indexT, indexA;
  489. auto itemType = builder.Struct()
  490. ->AddField<const char*>("Path", &indexP)
  491. .AddField<const char*>("Type", &indexT)
  492. .AddField<TYson>("Attributes", &indexA)
  493. .Build();
  494. auto resultType = builder.List()->Item(itemType).Build();
  495. builder.Args()->Add<const char*>().Done().Returns(resultType);
  496. if (!typesOnly) {
  497. builder.Implementation(new TFolderListFromFile(indexP, indexT, indexA, builder.GetSourcePosition()));
  498. }
  499. return true;
  500. }
  501. private:
  502. const ui32 IndexP_;
  503. const ui32 IndexT_;
  504. const ui32 IndexA_;
  505. const TSourcePosition Pos_;
  506. };
  507. SIMPLE_MODULE(TFileModule,
  508. TUserDataTypeFuncFactory<false, false, ByLineFuncName, TByLinesFunc, const char*, TUtf8, TYson, TJson, i8, ui8, i16, ui16, ui32, ui64, i32, i64, float, double, bool>,
  509. TFolderListFromFile
  510. )
  511. }
  512. REGISTER_MODULES(TFileModule)