presort.cpp 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. #include <yql/essentials/minikql/invoke_builtins/mkql_builtins.h>
  2. #include <yql/essentials/minikql/defs.h>
  3. #include <yql/essentials/minikql/computation/presort.h>
  4. #include <yql/essentials/minikql/mkql_alloc.h>
  5. #include <yql/essentials/minikql/mkql_string_util.h>
  6. #include <yql/essentials/public/udf/udf_types.h>
  7. #include <library/cpp/presort/presort.h>
  8. #include <util/random/random.h>
  9. #include <util/datetime/cputimer.h>
  10. #include <util/string/builder.h>
  11. using namespace NKikimr;
  12. using namespace NKikimr::NMiniKQL;
  13. namespace {
  14. struct TSettings {
  15. ui32 Index;
  16. bool IsOptional;
  17. NKikimr::NUdf::EDataSlot Slot;
  18. };
  19. template <bool Desc>
  20. struct TPresortOps : public NPresort::TResultOps {
  21. const TVector<TSettings>& Settings;
  22. NUdf::TUnboxedValue* Items;
  23. size_t Current = 0;
  24. TPresortOps(
  25. const TVector<TSettings>& settings,
  26. NUdf::TUnboxedValue* items)
  27. : Settings(settings)
  28. , Items(items)
  29. {}
  30. void Encode(IOutputStream& out) {
  31. for (const auto& setting : Settings) {
  32. auto& value = Items[setting.Index];
  33. switch (setting.Slot) {
  34. case NUdf::EDataSlot::Bool:
  35. NPresort::EncodeUnsignedInt(out, value.template Get<bool>(), Desc);
  36. break;
  37. case NUdf::EDataSlot::Uint8:
  38. NPresort::EncodeUnsignedInt(out, value.template Get<ui8>(), Desc);
  39. break;
  40. case NUdf::EDataSlot::Uint16:
  41. case NUdf::EDataSlot::Date:
  42. NPresort::EncodeUnsignedInt(out, value.template Get<ui16>(), Desc);
  43. break;
  44. case NUdf::EDataSlot::Uint32:
  45. case NUdf::EDataSlot::Datetime:
  46. NPresort::EncodeUnsignedInt(out, value.template Get<ui32>(), Desc);
  47. break;
  48. case NUdf::EDataSlot::Uint64:
  49. case NUdf::EDataSlot::Timestamp:
  50. NPresort::EncodeUnsignedInt(out, value.template Get<ui64>(), Desc);
  51. break;
  52. case NUdf::EDataSlot::Int8:
  53. NPresort::EncodeSignedInt(out, value.template Get<i8>(), Desc);
  54. break;
  55. case NUdf::EDataSlot::Int16:
  56. NPresort::EncodeSignedInt(out, value.template Get<i16>(), Desc);
  57. break;
  58. case NUdf::EDataSlot::Int32:
  59. NPresort::EncodeSignedInt(out, value.template Get<i32>(), Desc);
  60. break;
  61. case NUdf::EDataSlot::Int64:
  62. case NUdf::EDataSlot::Interval:
  63. NPresort::EncodeSignedInt(out, value.template Get<i64>(), Desc);
  64. break;
  65. case NUdf::EDataSlot::Float:
  66. NPresort::EncodeFloating(out, value.template Get<float>(), Desc);
  67. break;
  68. case NUdf::EDataSlot::Double:
  69. NPresort::EncodeFloating(out, value.template Get<double>(), Desc);
  70. break;
  71. case NUdf::EDataSlot::String:
  72. case NUdf::EDataSlot::Utf8: {
  73. auto strRef = value.AsStringRef();
  74. NPresort::EncodeString(out, TStringBuf(strRef.Data(), strRef.Size()), Desc);
  75. break;
  76. }
  77. default:
  78. MKQL_ENSURE(false, TStringBuilder() << "Unknown slot: " << setting.Slot);
  79. }
  80. }
  81. }
  82. void SetError(const TString& err) {
  83. MKQL_ENSURE(false, TStringBuilder() << "Presort decoding error: " << err);
  84. }
  85. void SetUnsignedInt(ui64 value) {
  86. const auto& setting = Settings[Current++];
  87. switch (setting.Slot) {
  88. case NUdf::EDataSlot::Bool:
  89. Items[setting.Index] = NUdf::TUnboxedValuePod(value != 0);
  90. break;
  91. case NUdf::EDataSlot::Uint8:
  92. Items[setting.Index] = NUdf::TUnboxedValuePod(static_cast<ui8>(value));
  93. break;
  94. case NUdf::EDataSlot::Uint16:
  95. case NUdf::EDataSlot::Date:
  96. Items[setting.Index] = NUdf::TUnboxedValuePod(static_cast<ui16>(value));
  97. break;
  98. case NUdf::EDataSlot::Uint32:
  99. case NUdf::EDataSlot::Datetime:
  100. Items[setting.Index] = NUdf::TUnboxedValuePod(static_cast<ui32>(value));
  101. break;
  102. case NUdf::EDataSlot::Uint64:
  103. case NUdf::EDataSlot::Timestamp:
  104. Items[setting.Index] = NUdf::TUnboxedValuePod(value);
  105. break;
  106. default:
  107. MKQL_ENSURE(false, TStringBuilder() << "Unknown slot: " << setting.Slot);
  108. }
  109. }
  110. void SetSignedInt(i64 value) {
  111. const auto& setting = Settings[Current++];
  112. switch (setting.Slot) {
  113. case NUdf::EDataSlot::Int8:
  114. Items[setting.Index] = NUdf::TUnboxedValuePod(static_cast<i8>(value));
  115. break;
  116. case NUdf::EDataSlot::Int16:
  117. Items[setting.Index] = NUdf::TUnboxedValuePod(static_cast<i16>(value));
  118. break;
  119. case NUdf::EDataSlot::Int32:
  120. Items[setting.Index] = NUdf::TUnboxedValuePod(static_cast<i32>(value));
  121. break;
  122. case NUdf::EDataSlot::Int64:
  123. case NUdf::EDataSlot::Interval:
  124. Items[setting.Index] = NUdf::TUnboxedValuePod(value);
  125. break;
  126. default:
  127. MKQL_ENSURE(false, "Unknown slot: " << setting.Slot);
  128. }
  129. }
  130. void SetFloat(float value) {
  131. Items[Settings[Current++].Index] = NUdf::TUnboxedValuePod(value);
  132. }
  133. void SetDouble(double value) {
  134. Items[Settings[Current++].Index] = NUdf::TUnboxedValuePod(value);
  135. }
  136. void SetString(const TString& value) {
  137. Items[Settings[Current++].Index] = MakeString(NUdf::TStringRef(value.data(), value.size()));
  138. }
  139. void SetOptional(bool) {}
  140. };
  141. template <typename T>
  142. NUdf::TUnboxedValue RandomValue() {
  143. return NUdf::TUnboxedValuePod(RandomNumber<T>());
  144. }
  145. template <>
  146. NUdf::TUnboxedValue RandomValue<char*>() {
  147. auto length = RandomNumber<ui64>(64);
  148. return MakeStringNotFilled(length);
  149. }
  150. template <typename T, NUdf::EDataSlot Slot, bool Desc>
  151. std::pair<ui64, ui64> MeasureOld() {
  152. constexpr size_t count = 1000;
  153. constexpr size_t rowCount = 100000;
  154. TScopedAlloc alloc(__LOCATION__);
  155. TMemoryUsageInfo memInfo("Memory");
  156. TVector<NUdf::TUnboxedValue> values;
  157. TVector<TSettings> settings;
  158. for (ui32 i = 0; i < count; ++i) {
  159. values.push_back(RandomValue<T>());
  160. settings.push_back({i, false, NUdf::TDataType<T>::Slot});
  161. }
  162. TSimpleTimer timer;
  163. TStringStream stream;
  164. for (size_t n = 0; n < rowCount; ++n) {
  165. stream.clear();
  166. TPresortOps<Desc> ops{settings, values.begin()};
  167. ops.Encode(stream);
  168. }
  169. auto encodeTime = timer.Get().MicroSeconds();
  170. auto rowSize = stream.Str().size();
  171. Cerr << "row size " << rowSize << ", row count " << rowCount << Endl;
  172. Cerr << "encoding " << rowSize * rowCount * 1000000 / encodeTime << " bytes per sec ("
  173. << encodeTime << " us)" << Endl;
  174. timer.Reset();
  175. for (size_t n = 0; n < rowCount; ++n) {
  176. TPresortOps<Desc> ops{settings, values.begin()};
  177. auto str = stream.Str();
  178. NPresort::Decode(ops, TStringBuf(str.data(), str.size()));
  179. }
  180. auto decodeTime = timer.Get().MicroSeconds();
  181. Cerr << "decoding " << rowSize * rowCount * 1000000 / decodeTime << " bytes per sec ("
  182. << decodeTime << " us)" << Endl;
  183. Cerr << Endl;
  184. return std::make_pair(encodeTime, decodeTime);
  185. }
  186. template <typename T, NUdf::EDataSlot Slot, bool Desc>
  187. std::pair<ui64, ui64> MeasureNew() {
  188. constexpr size_t count = 1000;
  189. constexpr size_t rowCount = 100000;
  190. TScopedAlloc alloc(__LOCATION__);
  191. TMemoryUsageInfo memInfo("Memory");
  192. TVector<NUdf::TUnboxedValuePod> values;
  193. TPresortEncoder encoder;
  194. TPresortDecoder decoder;
  195. for (size_t i = 0; i < count; ++i) {
  196. values.push_back(RandomValue<T>());
  197. encoder.AddType(Slot, false, Desc);
  198. decoder.AddType(Slot, false, Desc);
  199. }
  200. TSimpleTimer timer;
  201. TStringBuf buffer;
  202. for (size_t n = 0; n < rowCount; ++n) {
  203. encoder.Start();
  204. for (size_t i = 0; i < count; ++i) {
  205. encoder.Encode(values[i]);
  206. }
  207. buffer = encoder.Finish();
  208. }
  209. auto encodeTime = timer.Get().MicroSeconds();
  210. auto rowSize = buffer.size();
  211. Cerr << "row size " << rowSize << ", row count " << rowCount << Endl;
  212. Cerr << "encoding " << rowSize * rowCount * 1000000 / encodeTime << " bytes per sec ("
  213. << encodeTime << " us)" << Endl;
  214. timer.Reset();
  215. for (size_t n = 0; n < rowCount; ++n) {
  216. decoder.Start(buffer);
  217. for (size_t i = 0; i < count; ++i) {
  218. decoder.Decode();
  219. }
  220. encoder.Finish();
  221. }
  222. auto decodeTime = timer.Get().MicroSeconds();
  223. Cerr << "decoding " << rowSize * rowCount * 1000000 / decodeTime << " bytes per sec ("
  224. << decodeTime << " us)" << Endl;
  225. Cerr << Endl;
  226. return std::make_pair(encodeTime, decodeTime);
  227. }
  228. template <typename T, NUdf::EDataSlot Slot, bool Desc>
  229. void Compare() {
  230. auto newTimes = MeasureNew<T, Slot, Desc>();
  231. auto oldTimes = MeasureOld<T, Slot, Desc>();
  232. Cerr << "encoding speedup " << (double)oldTimes.first / (double)newTimes.first << Endl;
  233. Cerr << "decoding speedup " << (double)oldTimes.second / (double)newTimes.second << Endl;
  234. Cerr << "--------" << Endl << Endl;
  235. }
  236. template <typename T, NUdf::EDataSlot Slot>
  237. void CompareType(const char* type) {
  238. Cerr << type << Endl;
  239. Compare<T, Slot, false>();
  240. Cerr << type << " desc" << Endl;
  241. Compare<T, Slot, true>();
  242. }
  243. }
  244. int main(int, char**) {
  245. CompareType<bool, NUdf::EDataSlot::Bool>("bool");
  246. CompareType<ui8, NUdf::EDataSlot::Uint8>("ui8");
  247. CompareType<ui16, NUdf::EDataSlot::Uint16>("ui16");
  248. CompareType<ui32, NUdf::EDataSlot::Uint32>("ui32");
  249. CompareType<ui64, NUdf::EDataSlot::Uint64>("ui64");
  250. CompareType<float, NUdf::EDataSlot::Float>("float");
  251. CompareType<double, NUdf::EDataSlot::Double>("double");
  252. CompareType<char*, NUdf::EDataSlot::String>("string");
  253. return 0;
  254. }