yql_udf_index.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. #include "yql_udf_index.h"
  2. #include <yql/essentials/minikql/mkql_function_registry.h>
  3. namespace NYql {
  4. namespace {
  5. TVector<TResourceInfo::TPtr> ConvertResolveResultToResources(const TResolveResult& resolveResult, const TMap<TString, TString>& pathsWithMd5, bool isTrusted) {
  6. THashMap<TString, size_t> importIndex; // module => Imports index
  7. THashMap<TString, size_t> packageIndex; // package => Imports index
  8. THashMap<TString, TVector<TFunctionInfo>> functionIndex; // package => vector of functions
  9. for (size_t i = 0; i < resolveResult.ImportsSize(); ++i) {
  10. auto& import = resolveResult.GetImports(i);
  11. if (!import.ModulesSize()) {
  12. continue;
  13. }
  14. for (auto& m : import.GetModules()) {
  15. importIndex.emplace(m, i);
  16. }
  17. const TString package = import.GetModules(0);
  18. packageIndex.emplace(package, i);
  19. functionIndex.emplace(package, TVector<TFunctionInfo>());
  20. }
  21. for (auto& udf : resolveResult.GetUdfs()) {
  22. const TString module = TString(NKikimr::NMiniKQL::ModuleName(TStringBuf(udf.GetName())));
  23. const auto& import = resolveResult.GetImports(importIndex.at(module));
  24. const TString package = import.GetModules(0);
  25. TFunctionInfo newFunction;
  26. newFunction.Name = udf.GetName();
  27. newFunction.IsTypeAwareness = udf.GetIsTypeAwareness();
  28. newFunction.ArgCount = udf.GetArgCount();
  29. newFunction.OptionalArgCount = udf.GetOptionalArgCount();
  30. if (udf.HasCallableType()) {
  31. newFunction.CallableType = udf.GetCallableType();
  32. }
  33. if (udf.HasRunConfigType()) {
  34. newFunction.RunConfigType = udf.GetRunConfigType();
  35. }
  36. if (udf.HasIsStrict()) {
  37. newFunction.IsStrict = udf.GetIsStrict();
  38. }
  39. if (udf.HasSupportsBlocks()) {
  40. newFunction.SupportsBlocks = udf.GetSupportsBlocks();
  41. }
  42. functionIndex[package].push_back(newFunction);
  43. }
  44. TVector<TResourceInfo::TPtr> result;
  45. result.reserve(functionIndex.size());
  46. for (auto& p : functionIndex) {
  47. const auto& import = resolveResult.GetImports(packageIndex.at(p.first));
  48. auto info = MakeIntrusive<TResourceInfo>();
  49. info->IsTrusted = isTrusted;
  50. auto md5 = pathsWithMd5.FindPtr(import.GetFileAlias());
  51. info->Link = TDownloadLink::File(import.GetFileAlias(), md5 ? *md5 : "");
  52. info->Modules.insert(import.GetModules().begin(), import.GetModules().end());
  53. info->SetFunctions(p.second);
  54. result.push_back(info);
  55. }
  56. return result;
  57. }
  58. void AddResolveResultToRegistry(const TResolveResult& resolveResult, const TMap<TString, TString>& pathsWithMd5, bool isTrusted, TUdfIndex::EOverrideMode mode, TUdfIndex& registry) {
  59. auto resources = ConvertResolveResultToResources(resolveResult, pathsWithMd5, isTrusted);
  60. registry.RegisterResources(resources, mode);
  61. }
  62. }
  63. TUdfIndex::TUdfIndex() {
  64. }
  65. void TUdfIndex::SetCaseSentiveSearch(bool caseSensitive) {
  66. CaseSensitive_ = caseSensitive;
  67. }
  68. TUdfIndex::TUdfIndex(const TMap<TString, TResourceInfo::TPtr>& resources, bool caseSensitive)
  69. : Resources_(resources)
  70. , CaseSensitive_(caseSensitive)
  71. {
  72. for (const auto& x : Resources_) {
  73. ICaseModules_[to_lower(x.first)].insert(x.first);
  74. }
  75. }
  76. bool TUdfIndex::ContainsModuleStrict(const TString& moduleName) const {
  77. return Resources_.contains(moduleName);
  78. }
  79. bool TUdfIndex::CanonizeModule(TString& moduleName) const {
  80. if (Resources_.contains(moduleName)) {
  81. return true;
  82. }
  83. if (CaseSensitive_) {
  84. return false;
  85. }
  86. auto p = ICaseModules_.FindPtr(to_lower(moduleName));
  87. if (!p) {
  88. return false;
  89. }
  90. Y_ENSURE(p->size() > 0);
  91. if (p->size() > 1) {
  92. return false;
  93. }
  94. moduleName = *p->begin();
  95. return true;
  96. }
  97. TUdfIndex::EStatus TUdfIndex::ContainsModule(const TString& moduleName) const {
  98. if (Resources_.contains(moduleName)) {
  99. return EStatus::Found;
  100. }
  101. if (CaseSensitive_) {
  102. return EStatus::NotFound;
  103. }
  104. auto p = ICaseModules_.FindPtr(to_lower(moduleName));
  105. if (!p) {
  106. return EStatus::NotFound;
  107. }
  108. Y_ENSURE(p->size() > 0);
  109. return p->size() > 1 ? EStatus::Ambigious : EStatus::Found;
  110. }
  111. bool TUdfIndex::ContainsAnyModule(const TSet<TString>& modules) const {
  112. return AnyOf(modules, [this](auto& m) {
  113. return Resources_.contains(m);
  114. });
  115. }
  116. TUdfIndex::EStatus TUdfIndex::FindFunction(const TString& moduleName, const TString& functionName, TFunctionInfo& function) const {
  117. auto r = Resources_.FindPtr(moduleName);
  118. if (!r) {
  119. if (CaseSensitive_) {
  120. return EStatus::NotFound;
  121. }
  122. auto p = ICaseModules_.FindPtr(to_lower(moduleName));
  123. if (!p) {
  124. return EStatus::NotFound;
  125. }
  126. Y_ENSURE(p->size() > 0);
  127. if (p->size() > 1) {
  128. return EStatus::Ambigious;
  129. }
  130. r = Resources_.FindPtr(*p->begin());
  131. Y_ENSURE(r);
  132. }
  133. auto f = (*r)->Functions.FindPtr(functionName);
  134. if (!f) {
  135. if (CaseSensitive_) {
  136. return EStatus::NotFound;
  137. }
  138. auto p = (*r)->ICaseFuncNames.FindPtr(to_lower(functionName));
  139. if (!p) {
  140. return EStatus::NotFound;
  141. }
  142. Y_ENSURE(p->size() > 0);
  143. if (p->size() > 1) {
  144. return EStatus::Ambigious;
  145. }
  146. f = (*r)->Functions.FindPtr(*p->begin());
  147. Y_ENSURE(f);
  148. }
  149. function = *f;
  150. return EStatus::Found;
  151. }
  152. TResourceInfo::TPtr TUdfIndex::FindResourceByModule(const TString& moduleName) const {
  153. auto p = Resources_.FindPtr(moduleName);
  154. if (!p) {
  155. if (CaseSensitive_) {
  156. return nullptr;
  157. }
  158. auto n = ICaseModules_.FindPtr(to_lower(moduleName));
  159. if (!n) {
  160. return nullptr;
  161. }
  162. Y_ENSURE(n->size() > 0);
  163. if (n->size() > 1) {
  164. return nullptr;
  165. }
  166. p = Resources_.FindPtr(*n->begin());
  167. Y_ENSURE(p);
  168. }
  169. return *p;
  170. }
  171. TSet<TResourceInfo::TPtr> TUdfIndex::FindResourcesByModules(const TSet<TString>& modules) const {
  172. TSet<TResourceInfo::TPtr> result;
  173. for (auto& m : modules) {
  174. auto r = FindResourceByModule(m);
  175. if (r) {
  176. result.insert(r);
  177. }
  178. }
  179. return result;
  180. }
  181. void TUdfIndex::UnregisterResource(TResourceInfo::TPtr resource) {
  182. for (auto& m : resource->Modules) {
  183. Resources_.erase(m);
  184. auto& names = ICaseModules_[to_lower(m)];
  185. names.erase(m);
  186. if (names.empty()) {
  187. ICaseModules_.erase(to_lower(m));
  188. }
  189. }
  190. // resource pointer should be alive here to avoid problems with erase
  191. }
  192. void TUdfIndex::RegisterResource(const TResourceInfo::TPtr& resource, EOverrideMode mode) {
  193. Y_ENSURE(resource);
  194. if (resource->Modules.empty()) {
  195. // quite strange, but let's ignore
  196. return;
  197. }
  198. // detect conflict first
  199. if (ContainsAnyModule(resource->Modules)) {
  200. switch (mode) {
  201. case EOverrideMode::PreserveExisting:
  202. return;
  203. case EOverrideMode::RaiseError:
  204. // todo: specify module name(s) in intersection
  205. ythrow yexception() << "Conflict during resource " << resource->Link.Path << " registration";
  206. case EOverrideMode::ReplaceWithNew: {
  207. // we have to find resources and remove all related modules:
  208. // 1. find resources by newModules
  209. // 2. remove all functions related to found resources
  210. auto existingResources = FindResourcesByModules(resource->Modules);
  211. Y_ENSURE(!existingResources.empty());
  212. for (auto& r : existingResources) {
  213. UnregisterResource(r);
  214. }
  215. break;
  216. }
  217. } // switch
  218. }
  219. for (auto& m : resource->Modules) {
  220. Resources_.emplace(m, resource);
  221. ICaseModules_[to_lower(m)].insert(m);
  222. }
  223. }
  224. TIntrusivePtr<TUdfIndex> TUdfIndex::Clone() const {
  225. return new TUdfIndex(Resources_, CaseSensitive_);
  226. }
  227. void TUdfIndex::RegisterResources(const TVector<TResourceInfo::TPtr>& resources, EOverrideMode mode) {
  228. for (auto& r : resources) {
  229. RegisterResource(r, mode);
  230. }
  231. }
  232. void LoadRichMetadataToUdfIndex(const IUdfResolver& resolver, const TVector<TString>& paths, bool isTrusted, TUdfIndex::EOverrideMode mode, TUdfIndex& registry) {
  233. TMap<TString, TString> pathsWithMd5;
  234. for (const auto& path : paths) {
  235. pathsWithMd5[path] = "";
  236. }
  237. LoadRichMetadataToUdfIndex(resolver, pathsWithMd5, isTrusted, mode, registry);
  238. }
  239. void LoadRichMetadataToUdfIndex(const IUdfResolver& resolver, const TMap<TString, TString>& pathsWithMd5, bool isTrusted, TUdfIndex::EOverrideMode mode, TUdfIndex& registry) {
  240. TVector<TString> paths;
  241. paths.reserve(pathsWithMd5.size());
  242. for (const auto& p : pathsWithMd5) {
  243. paths.push_back(p.first);
  244. }
  245. const TResolveResult resolveResult = LoadRichMetadata(resolver, paths);
  246. AddResolveResultToRegistry(resolveResult, pathsWithMd5, isTrusted, mode, registry);
  247. }
  248. void LoadRichMetadataToUdfIndex(const IUdfResolver& resolver, const TVector<TUserDataBlock>& blocks, bool isTrusted, TUdfIndex::EOverrideMode mode, TUdfIndex& registry) {
  249. TVector<TUserDataBlock> blocksResolve;
  250. blocksResolve.reserve(blocks.size());
  251. // we can work with file path only
  252. TMap<TString, TString> pathsWithMd5;
  253. for (auto& b : blocks) {
  254. TString path;
  255. switch (b.Type) {
  256. case EUserDataType::URL:
  257. if (!b.FrozenFile) {
  258. ythrow yexception() << "DataBlock for " << b.Data << " is not frozen";
  259. }
  260. path = b.FrozenFile->GetPath().GetPath();
  261. pathsWithMd5.emplace(path, b.FrozenFile->GetMd5());
  262. break;
  263. case EUserDataType::PATH:
  264. {
  265. TString md5;
  266. if (b.FrozenFile) {
  267. md5 = b.FrozenFile->GetMd5();
  268. }
  269. path = b.Data;
  270. pathsWithMd5.emplace(b.Data, md5);
  271. break;
  272. }
  273. default:
  274. ythrow yexception() << "Unsupport data block type for " << b.Data;
  275. }
  276. TUserDataBlock br;
  277. br.Type = EUserDataType::PATH;
  278. br.Data = path;
  279. br.Usage.Set(EUserDataBlockUsage::Udf);
  280. br.CustomUdfPrefix = b.CustomUdfPrefix;
  281. blocksResolve.emplace_back(br);
  282. }
  283. const TResolveResult resolveResult = LoadRichMetadata(resolver, blocksResolve);
  284. AddResolveResultToRegistry(resolveResult, pathsWithMd5, isTrusted, mode, registry);
  285. }
  286. void LoadRichMetadataToUdfIndex(const IUdfResolver& resolver, const TUserDataBlock& block, TUdfIndex::EOverrideMode mode, TUdfIndex& registry) {
  287. TVector<TUserDataBlock> blocks({ block });
  288. const bool isTrusted = false;
  289. LoadRichMetadataToUdfIndex(resolver, blocks, isTrusted, mode, registry);
  290. }
  291. } // namespace NYql