digest_udf.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. #include <yql/essentials/public/udf/udf_helpers.h>
  2. #include <yql/essentials/public/udf/udf_value_builder.h>
  3. #include <util/digest/murmur.h>
  4. #include <util/digest/city.h>
  5. #include <util/digest/numeric.h>
  6. #include <util/digest/fnv.h>
  7. #include <library/cpp/digest/argonish/argon2.h>
  8. #include <library/cpp/digest/argonish/blake2b.h>
  9. #include <library/cpp/digest/crc32c/crc32c.h>
  10. #include <library/cpp/digest/md5/md5.h>
  11. #include <library/cpp/digest/murmur/murmur.h>
  12. #include <library/cpp/digest/old_crc/crc.h>
  13. #include <library/cpp/digest/sfh/sfh.h>
  14. #include <contrib/libs/highwayhash/highwayhash/c_bindings.h>
  15. #include <contrib/libs/highwayhash/highwayhash/sip_hash.h>
  16. #include <contrib/libs/farmhash/farmhash.h>
  17. #include <contrib/libs/xxhash/xxhash.h>
  18. #include <openssl/sha.h>
  19. using namespace NKikimr;
  20. using namespace NUdf;
  21. namespace {
  // Identifies one of the digest functions. The enumerator value is used as an
  // index into DigestNames, so both lists have to be kept in the same order.
  enum EDigestType {
      CRC32C,
      CRC64,
      FNV32,
      FNV64,
      MURMUR,
      MURMUR32,
      MURMUR2A,
      MURMUR2A32,
      CITY,
  };
  // Function names exposed by the digest UDFs, indexed by EDigestType.
  // The table is constexpr so that no code can reassign an entry at runtime.
  constexpr const char* DigestNames[] = {
      "Crc32c",
      "Crc64",
      "Fnv32",
      "Fnv64",
      "MurMurHash",
      "MurMurHash32",
      "MurMurHash2A",
      "MurMurHash2A32",
      "CityHash",
  };
  28. template<typename TResult>
  29. using TDigestGenerator = TResult(const TStringRef&, TMaybe<TResult> init);
  30. template<EDigestType DigestType, typename TResult, TDigestGenerator<TResult>* Generator>
  31. class TDigestFunctionUdf: public TBoxedValue {
  32. public:
  33. TDigestFunctionUdf(TSourcePosition pos) : Pos_(pos) {}
  34. static TStringRef Name() {
  35. static TString name = DigestNames[DigestType];
  36. return TStringRef(name);
  37. }
  38. static bool DeclareSignature(
  39. const TStringRef& name,
  40. TType*,
  41. IFunctionTypeInfoBuilder& builder,
  42. bool typesOnly)
  43. {
  44. if (Name() != name) {
  45. return false;
  46. }
  47. auto args = builder.Args();
  48. args->Add(builder.SimpleType<char *>()).Flags(ICallablePayload::TArgumentFlags::AutoMap);
  49. args->Add(builder.Optional()->Item(builder.SimpleType<TResult>()).Build()).Name("Init");
  50. args->Done();
  51. builder.OptionalArgs(1);
  52. builder.Returns(builder.SimpleType<TResult>());
  53. builder.IsStrict();
  54. if (!typesOnly) {
  55. builder.Implementation(new TDigestFunctionUdf<DigestType, TResult, Generator>(GetSourcePosition(builder)));
  56. }
  57. return true;
  58. }
  59. private:
  60. TUnboxedValue Run(const IValueBuilder*, const TUnboxedValuePod* args) const final try {
  61. TMaybe<TResult> init = Nothing();
  62. if (auto val = args[1]) {
  63. init = val.Get<TResult>();
  64. }
  65. return TUnboxedValuePod(Generator(args[0].AsStringRef(), init));
  66. } catch (const std ::exception&) {
  67. TStringBuilder sb;
  68. sb << Pos_ << " ";
  69. sb << CurrentExceptionMessage();
  70. sb << Endl << "[" << TStringBuf(Name()) << "]";
  71. UdfTerminate(sb.c_str());
  72. }
  73. TSourcePosition Pos_;
  74. };
  75. SIMPLE_STRICT_UDF(TCrc32c, ui32(TAutoMap<char*>)) {
  76. Y_UNUSED(valueBuilder);
  77. const auto& inputRef = args[0].AsStringRef();
  78. ui32 hash = Crc32c(inputRef.Data(), inputRef.Size());
  79. return TUnboxedValuePod(hash);
  80. }
  81. using TCrc64 = TDigestFunctionUdf<CRC64, ui64, [](auto& inputRef, auto init) {
  82. return crc64(inputRef.Data(), inputRef.Size(), init.GetOrElse(CRC64INIT));
  83. }>;
  84. using TFnv32 = TDigestFunctionUdf<FNV32, ui32, [](auto& inputRef, auto init) {
  85. if (init) {
  86. return FnvHash<ui32>(inputRef.Data(), inputRef.Size(), *init);
  87. } else {
  88. return FnvHash<ui32>(inputRef.Data(), inputRef.Size());
  89. }
  90. }>;
  91. using TFnv64 = TDigestFunctionUdf<FNV64, ui64, [](auto& inputRef, auto init) {
  92. if (init) {
  93. return FnvHash<ui64>(inputRef.Data(), inputRef.Size(), *init);
  94. } else {
  95. return FnvHash<ui64>(inputRef.Data(), inputRef.Size());
  96. }
  97. }>;
  98. using TMurMurHash = TDigestFunctionUdf<MURMUR, ui64, [](auto& inputRef, auto init) {
  99. if (init) {
  100. return MurmurHash<ui64>(inputRef.Data(), inputRef.Size(), *init);
  101. } else {
  102. return MurmurHash<ui64>(inputRef.Data(), inputRef.Size());
  103. }
  104. }>;
  105. using TMurMurHash32 = TDigestFunctionUdf<MURMUR32, ui32, [] (auto& inputRef, auto init) {
  106. if (init) {
  107. return MurmurHash<ui32>(inputRef.Data(), inputRef.Size(), *init);
  108. } else {
  109. return MurmurHash<ui32>(inputRef.Data(), inputRef.Size());
  110. }
  111. }>;
  112. using TMurMurHash2A = TDigestFunctionUdf<MURMUR2A, ui64, [] (auto& inputRef, auto init) {
  113. if (init) {
  114. return TMurmurHash2A<ui64>{*init}.Update(inputRef.Data(), inputRef.Size()).Value();
  115. } else {
  116. return TMurmurHash2A<ui64>{}.Update(inputRef.Data(), inputRef.Size()).Value();
  117. }
  118. }>;
  119. using TMurMurHash2A32 = TDigestFunctionUdf<MURMUR2A32, ui32, [] (auto& inputRef, auto init) {
  120. if (init) {
  121. return TMurmurHash2A<ui32>{*init}.Update(inputRef.Data(), inputRef.Size()).Value();
  122. } else {
  123. return TMurmurHash2A<ui32>{}.Update(inputRef.Data(), inputRef.Size()).Value();
  124. }
  125. }>;
  126. using TCityHash = TDigestFunctionUdf<CITY, ui64, [] (auto& inputRef, auto init) {
  127. if (init) {
  128. return CityHash64WithSeed(inputRef.Data(), inputRef.Size(), *init);
  129. } else {
  130. return CityHash64(inputRef.Data(), inputRef.Size());
  131. }
  132. }>;
  133. class TCityHash128: public TBoxedValue {
  134. public:
  135. static TStringRef Name() {
  136. static auto name = TStringRef::Of("CityHash128");
  137. return name;
  138. }
  139. static bool DeclareSignature(
  140. const TStringRef& name,
  141. TType* userType,
  142. IFunctionTypeInfoBuilder& builder,
  143. bool typesOnly) {
  144. Y_UNUSED(userType);
  145. if (Name() == name) {
  146. auto type = builder.Tuple(2)->Add<ui64>().Add<ui64>().Build();
  147. builder.Args(1)->Add<TAutoMap<char*>>();
  148. builder.Returns(type);
  149. if (!typesOnly) {
  150. builder.Implementation(new TCityHash128);
  151. }
  152. builder.IsStrict();
  153. return true;
  154. } else {
  155. return false;
  156. }
  157. }
  158. private:
  159. TUnboxedValue Run(
  160. const IValueBuilder* valueBuilder,
  161. const TUnboxedValuePod* args) const override {
  162. TUnboxedValue* items = nullptr;
  163. auto val = valueBuilder->NewArray(2U, items);
  164. const auto& inputRef = args[0].AsStringRef();
  165. uint128 hash = CityHash128(inputRef.Data(), inputRef.Size());
  166. items[0] = TUnboxedValuePod(hash.first);
  167. items[1] = TUnboxedValuePod(hash.second);
  168. return val;
  169. }
  170. };
  171. SIMPLE_STRICT_UDF(TNumericHash, ui64(TAutoMap<ui64>)) {
  172. Y_UNUSED(valueBuilder);
  173. ui64 input = args[0].Get<ui64>();
  174. ui64 hash = (ui64)NumericHash(input);
  175. return TUnboxedValuePod(hash);
  176. }
  177. SIMPLE_STRICT_UDF(TMd5Hex, char*(TAutoMap<char*>)) {
  178. const auto& inputRef = args[0].AsStringRef();
  179. MD5 md5;
  180. const TString& hash = md5.Calc(inputRef);
  181. return valueBuilder->NewString(hash);
  182. }
  183. SIMPLE_STRICT_UDF(TMd5Raw, char*(TAutoMap<char*>)) {
  184. const auto& inputRef = args[0].AsStringRef();
  185. MD5 md5;
  186. const TString& hash = md5.CalcRaw(inputRef);
  187. return valueBuilder->NewString(hash);
  188. }
  189. SIMPLE_STRICT_UDF(TMd5HalfMix, ui64(TAutoMap<char*>)) {
  190. Y_UNUSED(valueBuilder);
  191. return TUnboxedValuePod(MD5::CalcHalfMix(args[0].AsStringRef()));
  192. }
  193. SIMPLE_STRICT_UDF(TArgon2, char*(TAutoMap<char*>, TAutoMap<char*>)) {
  194. const static ui32 outSize = 32;
  195. const static NArgonish::TArgon2Factory afactory;
  196. const static THolder<NArgonish::IArgon2Base> argon2 = afactory.Create(
  197. NArgonish::EArgon2Type::Argon2d, 1, 32, 1);
  198. const TStringRef inputRef = args[0].AsStringRef();
  199. const TStringRef saltRef = args[1].AsStringRef();
  200. ui8 out[outSize];
  201. argon2->Hash(reinterpret_cast<const ui8*>(inputRef.Data()), inputRef.Size(),
  202. reinterpret_cast<const ui8*>(saltRef.Data()), saltRef.Size(),
  203. out, outSize);
  204. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(&out[0]), outSize));
  205. }
  206. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TBlake2B, char*(TAutoMap<char*>, TOptional<char*>), 1) {
  207. const static ui32 outSize = 32;
  208. const static NArgonish::TBlake2BFactory bfactory;
  209. const TStringRef inputRef = args[0].AsStringRef();
  210. THolder<NArgonish::IBlake2Base> blake2b;
  211. if (args[1]) {
  212. const TStringRef keyRef = args[1].AsStringRef();
  213. if (keyRef.Size() == 0) {
  214. blake2b = bfactory.Create(outSize);
  215. } else {
  216. blake2b = bfactory.Create(outSize, reinterpret_cast<const ui8*>(keyRef.Data()), keyRef.Size());
  217. }
  218. } else {
  219. blake2b = bfactory.Create(outSize);
  220. }
  221. ui8 out[outSize];
  222. blake2b->Update(inputRef.Data(), inputRef.Size());
  223. blake2b->Final(out, outSize);
  224. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(&out[0]), outSize));
  225. }
  226. SIMPLE_STRICT_UDF(TSipHash, ui64(ui64, ui64, TAutoMap<char*>)) {
  227. using namespace highwayhash;
  228. Y_UNUSED(valueBuilder);
  229. const TStringRef inputRef = args[2].AsStringRef();
  230. const HH_U64 state[2] = {args[0].Get<ui64>(), args[1].Get<ui64>()};
  231. ui64 hash = SipHash(state, inputRef.Data(), inputRef.Size());
  232. return TUnboxedValuePod(hash);
  233. }
  234. SIMPLE_STRICT_UDF(THighwayHash, ui64(ui64, ui64, ui64, ui64, TAutoMap<char*>)) {
  235. using namespace highwayhash;
  236. Y_UNUSED(valueBuilder);
  237. const TStringRef inputRef = args[4].AsStringRef();
  238. const uint64_t key[4] = {
  239. args[0].Get<ui64>(),
  240. args[1].Get<ui64>(),
  241. args[2].Get<ui64>(),
  242. args[3].Get<ui64>()};
  243. ui64 hash = HighwayHash64(key, inputRef.Data(), inputRef.Size());
  244. return TUnboxedValuePod(hash);
  245. }
  246. SIMPLE_STRICT_UDF(TFarmHashFingerprint, ui64(TAutoMap<ui64>)) {
  247. Y_UNUSED(valueBuilder);
  248. ui64 input = args[0].Get<ui64>();
  249. ui64 hash = util::Fingerprint(input);
  250. return TUnboxedValuePod(hash);
  251. }
  252. SIMPLE_STRICT_UDF(TFarmHashFingerprint2, ui64(TAutoMap<ui64>, TAutoMap<ui64>)) {
  253. Y_UNUSED(valueBuilder);
  254. ui64 low = args[0].Get<ui64>();
  255. ui64 high = args[1].Get<ui64>();
  256. ui64 hash = util::Fingerprint(util::Uint128(low, high));
  257. return TUnboxedValuePod(hash);
  258. }
  259. SIMPLE_STRICT_UDF(TFarmHashFingerprint32, ui32(TAutoMap<char*>)) {
  260. Y_UNUSED(valueBuilder);
  261. const auto& inputRef = args[0].AsStringRef();
  262. auto hash = util::Fingerprint32(inputRef.Data(), inputRef.Size());
  263. return TUnboxedValuePod(ui32(hash));
  264. }
  265. SIMPLE_STRICT_UDF(TFarmHashFingerprint64, ui64(TAutoMap<char*>)) {
  266. Y_UNUSED(valueBuilder);
  267. const auto& inputRef = args[0].AsStringRef();
  268. auto hash = util::Fingerprint64(inputRef.Data(), inputRef.Size());
  269. return TUnboxedValuePod(ui64(hash));
  270. }
  271. class TFarmHashFingerprint128: public TBoxedValue {
  272. public:
  273. static TStringRef Name() {
  274. static auto name = TStringRef::Of("FarmHashFingerprint128");
  275. return name;
  276. }
  277. static bool DeclareSignature(
  278. const TStringRef& name,
  279. TType* userType,
  280. IFunctionTypeInfoBuilder& builder,
  281. bool typesOnly) {
  282. Y_UNUSED(userType);
  283. if (Name() == name) {
  284. auto type = builder.Tuple(2)->Add<ui64>().Add<ui64>().Build();
  285. builder.Args(1)->Add<TAutoMap<char*>>();
  286. builder.Returns(type);
  287. if (!typesOnly) {
  288. builder.Implementation(new TFarmHashFingerprint128);
  289. }
  290. builder.IsStrict();
  291. return true;
  292. } else {
  293. return false;
  294. }
  295. }
  296. private:
  297. TUnboxedValue Run(
  298. const IValueBuilder* valueBuilder,
  299. const TUnboxedValuePod* args) const override {
  300. TUnboxedValue* items = nullptr;
  301. auto val = valueBuilder->NewArray(2U, items);
  302. const auto& inputRef = args[0].AsStringRef();
  303. auto hash = util::Fingerprint128(inputRef.Data(), inputRef.Size());
  304. items[0] = TUnboxedValuePod(static_cast<ui64>(hash.first));
  305. items[1] = TUnboxedValuePod(static_cast<ui64>(hash.second));
  306. return val;
  307. }
  308. };
  309. SIMPLE_STRICT_UDF(TSuperFastHash, ui32(TAutoMap<char*>)) {
  310. Y_UNUSED(valueBuilder);
  311. const auto& inputRef = args[0].AsStringRef();
  312. ui32 hash = SuperFastHash(inputRef.Data(), inputRef.Size());
  313. return TUnboxedValuePod(hash);
  314. }
  315. SIMPLE_STRICT_UDF(TSha1, char*(TAutoMap<char*>)) {
  316. const auto& inputRef = args[0].AsStringRef();
  317. SHA_CTX sha;
  318. SHA1_Init(&sha);
  319. SHA1_Update(&sha, inputRef.Data(), inputRef.Size());
  320. unsigned char hash[SHA_DIGEST_LENGTH];
  321. SHA1_Final(hash, &sha);
  322. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(hash), sizeof(hash)));
  323. }
  324. SIMPLE_STRICT_UDF(TSha256, char*(TAutoMap<char*>)) {
  325. const auto& inputRef = args[0].AsStringRef();
  326. SHA256_CTX sha;
  327. SHA256_Init(&sha);
  328. SHA256_Update(&sha, inputRef.Data(), inputRef.Size());
  329. unsigned char hash[SHA256_DIGEST_LENGTH];
  330. SHA256_Final(hash, &sha);
  331. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(hash), sizeof(hash)));
  332. }
  333. SIMPLE_STRICT_UDF(TIntHash64, ui64(TAutoMap<ui64>)) {
  334. Y_UNUSED(valueBuilder);
  335. ui64 x = args[0].Get<ui64>();
  336. x ^= 0x4CF2D2BAAE6DA887ULL;
  337. x ^= x >> 33;
  338. x *= 0xff51afd7ed558ccdULL;
  339. x ^= x >> 33;
  340. x *= 0xc4ceb9fe1a85ec53ULL;
  341. x ^= x >> 33;
  342. return TUnboxedValuePod(x);
  343. }
  344. SIMPLE_STRICT_UDF(TXXH3, ui64(TAutoMap<char*>)) {
  345. Y_UNUSED(valueBuilder);
  346. const auto& inputRef = args[0].AsStringRef();
  347. const ui64 hash = XXH3_64bits(inputRef.Data(), inputRef.Size());
  348. return TUnboxedValuePod(hash);
  349. }
  350. class TXXH3_128: public TBoxedValue {
  351. public:
  352. static TStringRef Name() {
  353. static auto name = TStringRef::Of("XXH3_128");
  354. return name;
  355. }
  356. static bool DeclareSignature(const TStringRef& name, TType*, IFunctionTypeInfoBuilder& builder, bool typesOnly) {
  357. if (Name() == name) {
  358. const auto type = builder.Tuple(2)->Add<ui64>().Add<ui64>().Build();
  359. builder.Args(1)->Add<TAutoMap<char*>>();
  360. builder.Returns(type);
  361. if (!typesOnly) {
  362. builder.Implementation(new TXXH3_128);
  363. }
  364. builder.IsStrict();
  365. return true;
  366. } else {
  367. return false;
  368. }
  369. }
  370. private:
  371. TUnboxedValue Run(const IValueBuilder* valueBuilder, const TUnboxedValuePod* args) const final {
  372. TUnboxedValue* items = nullptr;
  373. auto val = valueBuilder->NewArray(2U, items);
  374. const auto& inputRef = args[0].AsStringRef();
  375. const auto hash = XXH3_128bits(inputRef.Data(), inputRef.Size());
  376. items[0] = TUnboxedValuePod(ui64(hash.low64));
  377. items[1] = TUnboxedValuePod(ui64(hash.high64));
  378. return val;
  379. }
  380. };
  381. SIMPLE_MODULE(TDigestModule,
  382. TCrc32c,
  383. TCrc64,
  384. TFnv32,
  385. TFnv64,
  386. TMurMurHash,
  387. TMurMurHash32,
  388. TMurMurHash2A,
  389. TMurMurHash2A32,
  390. TCityHash,
  391. TCityHash128,
  392. TNumericHash,
  393. TMd5Hex,
  394. TMd5Raw,
  395. TMd5HalfMix,
  396. TArgon2,
  397. TBlake2B,
  398. TSipHash,
  399. THighwayHash,
  400. TFarmHashFingerprint,
  401. TFarmHashFingerprint2,
  402. TFarmHashFingerprint32,
  403. TFarmHashFingerprint64,
  404. TFarmHashFingerprint128,
  405. TSuperFastHash,
  406. TSha1,
  407. TSha256,
  408. TIntHash64,
  409. TXXH3,
  410. TXXH3_128
  411. )
  412. }
  // Registers TDigestModule via the REGISTER_MODULES macro.
  REGISTER_MODULES(TDigestModule)