// digest_udf.cpp (14 KB)
  1. #include <yql/essentials/public/udf/udf_helpers.h>
  2. #include <yql/essentials/public/udf/udf_value_builder.h>
  3. #include <util/digest/murmur.h>
  4. #include <util/digest/city.h>
  5. #include <util/digest/numeric.h>
  6. #include <util/digest/fnv.h>
  7. #include <library/cpp/digest/argonish/argon2.h>
  8. #include <library/cpp/digest/argonish/blake2b.h>
  9. #include <library/cpp/digest/crc32c/crc32c.h>
  10. #include <library/cpp/digest/md5/md5.h>
  11. #include <library/cpp/digest/murmur/murmur.h>
  12. #include <library/cpp/digest/old_crc/crc.h>
  13. #include <library/cpp/digest/sfh/sfh.h>
  14. #include <contrib/libs/highwayhash/highwayhash/c_bindings.h>
  15. #include <contrib/libs/highwayhash/highwayhash/sip_hash.h>
  16. #include <contrib/libs/farmhash/farmhash.h>
  17. #include <contrib/libs/xxhash/xxhash.h>
  18. #include <openssl/sha.h>
  19. using namespace NKikimr;
  20. using namespace NUdf;
  21. namespace {
  22. SIMPLE_STRICT_UDF(TCrc32c, ui32(TAutoMap<char*>)) {
  23. Y_UNUSED(valueBuilder);
  24. const auto& inputRef = args[0].AsStringRef();
  25. ui32 hash = Crc32c(inputRef.Data(), inputRef.Size());
  26. return TUnboxedValuePod(hash);
  27. }
  28. SIMPLE_STRICT_UDF(TCrc64, ui64(TAutoMap<char*>)) {
  29. Y_UNUSED(valueBuilder);
  30. const auto& inputRef = args[0].AsStringRef();
  31. ui64 hash = crc64(inputRef.Data(), inputRef.Size());
  32. return TUnboxedValuePod(hash);
  33. }
  34. SIMPLE_STRICT_UDF(TFnv32, ui32(TAutoMap<char*>)) {
  35. Y_UNUSED(valueBuilder);
  36. const auto& inputRef = args[0].AsStringRef();
  37. ui32 hash = FnvHash<ui32>(inputRef.Data(), inputRef.Size());
  38. return TUnboxedValuePod(hash);
  39. }
  40. SIMPLE_STRICT_UDF(TFnv64, ui64(TAutoMap<char*>)) {
  41. Y_UNUSED(valueBuilder);
  42. const auto& inputRef = args[0].AsStringRef();
  43. ui64 hash = FnvHash<ui64>(inputRef.Data(), inputRef.Size());
  44. return TUnboxedValuePod(hash);
  45. }
  46. SIMPLE_STRICT_UDF(TMurMurHash, ui64(TAutoMap<char*>)) {
  47. Y_UNUSED(valueBuilder);
  48. const auto& inputRef = args[0].AsStringRef();
  49. ui64 hash = MurmurHash<ui64>(inputRef.Data(), inputRef.Size());
  50. return TUnboxedValuePod(hash);
  51. }
  52. SIMPLE_STRICT_UDF(TMurMurHash32, ui32(TAutoMap<char*>)) {
  53. Y_UNUSED(valueBuilder);
  54. const auto& inputRef = args[0].AsStringRef();
  55. ui32 hash = MurmurHash<ui32>(inputRef.Data(), inputRef.Size());
  56. return TUnboxedValuePod(hash);
  57. }
  58. SIMPLE_STRICT_UDF(TMurMurHash2A, ui64(TAutoMap<char*>)) {
  59. Y_UNUSED(valueBuilder);
  60. const auto& inputRef = args[0].AsStringRef();
  61. ui64 hash = TMurmurHash2A<ui64>{}.Update(inputRef.Data(), inputRef.Size()).Value();
  62. return TUnboxedValuePod(hash);
  63. }
  64. SIMPLE_STRICT_UDF(TMurMurHash2A32, ui32(TAutoMap<char*>)) {
  65. Y_UNUSED(valueBuilder);
  66. const auto& inputRef = args[0].AsStringRef();
  67. ui32 hash = TMurmurHash2A<ui32>{}.Update(inputRef.Data(), inputRef.Size()).Value();
  68. return TUnboxedValuePod(hash);
  69. }
  70. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TCityHash, ui64(TAutoMap<char*>, TOptional<ui64>), 1) {
  71. Y_UNUSED(valueBuilder);
  72. const auto& inputRef = args[0].AsStringRef();
  73. ui64 hash;
  74. if (args[1]) {
  75. hash = CityHash64WithSeed(inputRef.Data(), inputRef.Size(), args[1].Get<ui64>());
  76. } else {
  77. hash = CityHash64(inputRef.Data(), inputRef.Size());
  78. }
  79. return TUnboxedValuePod(hash);
  80. }
  81. using TUi64Pair = NUdf::TTuple<ui64, ui64>;
  82. class TCityHash128: public TBoxedValue {
  83. public:
  84. static TStringRef Name() {
  85. static auto name = TStringRef::Of("CityHash128");
  86. return name;
  87. }
  88. static bool DeclareSignature(
  89. const TStringRef& name,
  90. TType* userType,
  91. IFunctionTypeInfoBuilder& builder,
  92. bool typesOnly) {
  93. Y_UNUSED(userType);
  94. if (Name() == name) {
  95. auto type = builder.Tuple(2)->Add<ui64>().Add<ui64>().Build();
  96. builder.Args(1)->Add<TAutoMap<char*>>();
  97. builder.Returns(type);
  98. if (!typesOnly) {
  99. builder.Implementation(new TCityHash128);
  100. }
  101. builder.IsStrict();
  102. return true;
  103. } else {
  104. return false;
  105. }
  106. }
  107. private:
  108. TUnboxedValue Run(
  109. const IValueBuilder* valueBuilder,
  110. const TUnboxedValuePod* args) const override {
  111. TUnboxedValue* items = nullptr;
  112. auto val = valueBuilder->NewArray(2U, items);
  113. const auto& inputRef = args[0].AsStringRef();
  114. uint128 hash = CityHash128(inputRef.Data(), inputRef.Size());
  115. items[0] = TUnboxedValuePod(hash.first);
  116. items[1] = TUnboxedValuePod(hash.second);
  117. return val;
  118. }
  119. };
  120. SIMPLE_STRICT_UDF(TNumericHash, ui64(TAutoMap<ui64>)) {
  121. Y_UNUSED(valueBuilder);
  122. ui64 input = args[0].Get<ui64>();
  123. ui64 hash = (ui64)NumericHash(input);
  124. return TUnboxedValuePod(hash);
  125. }
  126. SIMPLE_STRICT_UDF(TMd5Hex, char*(TAutoMap<char*>)) {
  127. const auto& inputRef = args[0].AsStringRef();
  128. MD5 md5;
  129. const TString& hash = md5.Calc(inputRef);
  130. return valueBuilder->NewString(hash);
  131. }
  132. SIMPLE_STRICT_UDF(TMd5Raw, char*(TAutoMap<char*>)) {
  133. const auto& inputRef = args[0].AsStringRef();
  134. MD5 md5;
  135. const TString& hash = md5.CalcRaw(inputRef);
  136. return valueBuilder->NewString(hash);
  137. }
  138. SIMPLE_STRICT_UDF(TMd5HalfMix, ui64(TAutoMap<char*>)) {
  139. Y_UNUSED(valueBuilder);
  140. return TUnboxedValuePod(MD5::CalcHalfMix(args[0].AsStringRef()));
  141. }
  142. SIMPLE_STRICT_UDF(TArgon2, char*(TAutoMap<char*>, TAutoMap<char*>)) {
  143. const static ui32 outSize = 32;
  144. const static NArgonish::TArgon2Factory afactory;
  145. const static THolder<NArgonish::IArgon2Base> argon2 = afactory.Create(
  146. NArgonish::EArgon2Type::Argon2d, 1, 32, 1);
  147. const TStringRef inputRef = args[0].AsStringRef();
  148. const TStringRef saltRef = args[1].AsStringRef();
  149. ui8 out[outSize];
  150. argon2->Hash(reinterpret_cast<const ui8*>(inputRef.Data()), inputRef.Size(),
  151. reinterpret_cast<const ui8*>(saltRef.Data()), saltRef.Size(),
  152. out, outSize);
  153. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(&out[0]), outSize));
  154. }
  155. SIMPLE_STRICT_UDF_WITH_OPTIONAL_ARGS(TBlake2B, char*(TAutoMap<char*>, TOptional<char*>), 1) {
  156. const static ui32 outSize = 32;
  157. const static NArgonish::TBlake2BFactory bfactory;
  158. const TStringRef inputRef = args[0].AsStringRef();
  159. THolder<NArgonish::IBlake2Base> blake2b;
  160. if (args[1]) {
  161. const TStringRef keyRef = args[1].AsStringRef();
  162. if (keyRef.Size() == 0) {
  163. blake2b = bfactory.Create(outSize);
  164. } else {
  165. blake2b = bfactory.Create(outSize, reinterpret_cast<const ui8*>(keyRef.Data()), keyRef.Size());
  166. }
  167. } else {
  168. blake2b = bfactory.Create(outSize);
  169. }
  170. ui8 out[outSize];
  171. blake2b->Update(inputRef.Data(), inputRef.Size());
  172. blake2b->Final(out, outSize);
  173. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(&out[0]), outSize));
  174. }
  175. SIMPLE_STRICT_UDF(TSipHash, ui64(ui64, ui64, TAutoMap<char*>)) {
  176. using namespace highwayhash;
  177. Y_UNUSED(valueBuilder);
  178. const TStringRef inputRef = args[2].AsStringRef();
  179. const HH_U64 state[2] = {args[0].Get<ui64>(), args[1].Get<ui64>()};
  180. ui64 hash = SipHash(state, inputRef.Data(), inputRef.Size());
  181. return TUnboxedValuePod(hash);
  182. }
  183. SIMPLE_STRICT_UDF(THighwayHash, ui64(ui64, ui64, ui64, ui64, TAutoMap<char*>)) {
  184. using namespace highwayhash;
  185. Y_UNUSED(valueBuilder);
  186. const TStringRef inputRef = args[4].AsStringRef();
  187. const uint64_t key[4] = {
  188. args[0].Get<ui64>(),
  189. args[1].Get<ui64>(),
  190. args[2].Get<ui64>(),
  191. args[3].Get<ui64>()};
  192. ui64 hash = HighwayHash64(key, inputRef.Data(), inputRef.Size());
  193. return TUnboxedValuePod(hash);
  194. }
  195. SIMPLE_STRICT_UDF(TFarmHashFingerprint, ui64(TAutoMap<ui64>)) {
  196. Y_UNUSED(valueBuilder);
  197. ui64 input = args[0].Get<ui64>();
  198. ui64 hash = util::Fingerprint(input);
  199. return TUnboxedValuePod(hash);
  200. }
  201. SIMPLE_STRICT_UDF(TFarmHashFingerprint2, ui64(TAutoMap<ui64>, TAutoMap<ui64>)) {
  202. Y_UNUSED(valueBuilder);
  203. ui64 low = args[0].Get<ui64>();
  204. ui64 high = args[1].Get<ui64>();
  205. ui64 hash = util::Fingerprint(util::Uint128(low, high));
  206. return TUnboxedValuePod(hash);
  207. }
  208. SIMPLE_STRICT_UDF(TFarmHashFingerprint32, ui32(TAutoMap<char*>)) {
  209. Y_UNUSED(valueBuilder);
  210. const auto& inputRef = args[0].AsStringRef();
  211. auto hash = util::Fingerprint32(inputRef.Data(), inputRef.Size());
  212. return TUnboxedValuePod(ui32(hash));
  213. }
  214. SIMPLE_STRICT_UDF(TFarmHashFingerprint64, ui64(TAutoMap<char*>)) {
  215. Y_UNUSED(valueBuilder);
  216. const auto& inputRef = args[0].AsStringRef();
  217. auto hash = util::Fingerprint64(inputRef.Data(), inputRef.Size());
  218. return TUnboxedValuePod(ui64(hash));
  219. }
  220. class TFarmHashFingerprint128: public TBoxedValue {
  221. public:
  222. static TStringRef Name() {
  223. static auto name = TStringRef::Of("FarmHashFingerprint128");
  224. return name;
  225. }
  226. static bool DeclareSignature(
  227. const TStringRef& name,
  228. TType* userType,
  229. IFunctionTypeInfoBuilder& builder,
  230. bool typesOnly) {
  231. Y_UNUSED(userType);
  232. if (Name() == name) {
  233. auto type = builder.Tuple(2)->Add<ui64>().Add<ui64>().Build();
  234. builder.Args(1)->Add<TAutoMap<char*>>();
  235. builder.Returns(type);
  236. if (!typesOnly) {
  237. builder.Implementation(new TFarmHashFingerprint128);
  238. }
  239. builder.IsStrict();
  240. return true;
  241. } else {
  242. return false;
  243. }
  244. }
  245. private:
  246. TUnboxedValue Run(
  247. const IValueBuilder* valueBuilder,
  248. const TUnboxedValuePod* args) const override {
  249. TUnboxedValue* items = nullptr;
  250. auto val = valueBuilder->NewArray(2U, items);
  251. const auto& inputRef = args[0].AsStringRef();
  252. auto hash = util::Fingerprint128(inputRef.Data(), inputRef.Size());
  253. items[0] = TUnboxedValuePod(static_cast<ui64>(hash.first));
  254. items[1] = TUnboxedValuePod(static_cast<ui64>(hash.second));
  255. return val;
  256. }
  257. };
  258. SIMPLE_STRICT_UDF(TSuperFastHash, ui32(TAutoMap<char*>)) {
  259. Y_UNUSED(valueBuilder);
  260. const auto& inputRef = args[0].AsStringRef();
  261. ui32 hash = SuperFastHash(inputRef.Data(), inputRef.Size());
  262. return TUnboxedValuePod(hash);
  263. }
  264. SIMPLE_STRICT_UDF(TSha1, char*(TAutoMap<char*>)) {
  265. const auto& inputRef = args[0].AsStringRef();
  266. SHA_CTX sha;
  267. SHA1_Init(&sha);
  268. SHA1_Update(&sha, inputRef.Data(), inputRef.Size());
  269. unsigned char hash[SHA_DIGEST_LENGTH];
  270. SHA1_Final(hash, &sha);
  271. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(hash), sizeof(hash)));
  272. }
  273. SIMPLE_STRICT_UDF(TSha256, char*(TAutoMap<char*>)) {
  274. const auto& inputRef = args[0].AsStringRef();
  275. SHA256_CTX sha;
  276. SHA256_Init(&sha);
  277. SHA256_Update(&sha, inputRef.Data(), inputRef.Size());
  278. unsigned char hash[SHA256_DIGEST_LENGTH];
  279. SHA256_Final(hash, &sha);
  280. return valueBuilder->NewString(TStringRef(reinterpret_cast<char*>(hash), sizeof(hash)));
  281. }
  282. SIMPLE_STRICT_UDF(TIntHash64, ui64(TAutoMap<ui64>)) {
  283. Y_UNUSED(valueBuilder);
  284. ui64 x = args[0].Get<ui64>();
  285. x ^= 0x4CF2D2BAAE6DA887ULL;
  286. x ^= x >> 33;
  287. x *= 0xff51afd7ed558ccdULL;
  288. x ^= x >> 33;
  289. x *= 0xc4ceb9fe1a85ec53ULL;
  290. x ^= x >> 33;
  291. return TUnboxedValuePod(x);
  292. }
  293. SIMPLE_STRICT_UDF(TXXH3, ui64(TAutoMap<char*>)) {
  294. Y_UNUSED(valueBuilder);
  295. const auto& inputRef = args[0].AsStringRef();
  296. const ui64 hash = XXH3_64bits(inputRef.Data(), inputRef.Size());
  297. return TUnboxedValuePod(hash);
  298. }
  299. class TXXH3_128: public TBoxedValue {
  300. public:
  301. static TStringRef Name() {
  302. static auto name = TStringRef::Of("XXH3_128");
  303. return name;
  304. }
  305. static bool DeclareSignature(const TStringRef& name, TType*, IFunctionTypeInfoBuilder& builder, bool typesOnly) {
  306. if (Name() == name) {
  307. const auto type = builder.Tuple(2)->Add<ui64>().Add<ui64>().Build();
  308. builder.Args(1)->Add<TAutoMap<char*>>();
  309. builder.Returns(type);
  310. if (!typesOnly) {
  311. builder.Implementation(new TXXH3_128);
  312. }
  313. builder.IsStrict();
  314. return true;
  315. } else {
  316. return false;
  317. }
  318. }
  319. private:
  320. TUnboxedValue Run(const IValueBuilder* valueBuilder, const TUnboxedValuePod* args) const final {
  321. TUnboxedValue* items = nullptr;
  322. auto val = valueBuilder->NewArray(2U, items);
  323. const auto& inputRef = args[0].AsStringRef();
  324. const auto hash = XXH3_128bits(inputRef.Data(), inputRef.Size());
  325. items[0] = TUnboxedValuePod(ui64(hash.low64));
  326. items[1] = TUnboxedValuePod(ui64(hash.high64));
  327. return val;
  328. }
  329. };
  330. SIMPLE_MODULE(TDigestModule,
  331. TCrc32c,
  332. TCrc64,
  333. TFnv32,
  334. TFnv64,
  335. TMurMurHash,
  336. TMurMurHash32,
  337. TMurMurHash2A,
  338. TMurMurHash2A32,
  339. TCityHash,
  340. TCityHash128,
  341. TNumericHash,
  342. TMd5Hex,
  343. TMd5Raw,
  344. TMd5HalfMix,
  345. TArgon2,
  346. TBlake2B,
  347. TSipHash,
  348. THighwayHash,
  349. TFarmHashFingerprint,
  350. TFarmHashFingerprint2,
  351. TFarmHashFingerprint32,
  352. TFarmHashFingerprint64,
  353. TFarmHashFingerprint128,
  354. TSuperFastHash,
  355. TSha1,
  356. TSha256,
  357. TIntHash64,
  358. TXXH3,
  359. TXXH3_128
  360. )
  361. }
  362. REGISTER_MODULES(TDigestModule)