presort.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553
  1. #pragma once
  2. #include <util/generic/bitops.h>
  3. #include <util/generic/maybe.h>
  4. #include <util/generic/strbuf.h>
  5. #include <util/generic/string.h>
  6. #include <util/stream/output.h>
  7. #include <cfloat>
  8. #include <cmath>
  9. // TODO: remove when switched to c++11 stl
  10. #if _LIBCPP_STD_VER >= 11
  11. #include <limits>
  12. #else
  13. #define PRESORT_FP_DISABLED
  14. #endif
  15. /*
  16. Serialization PREServing ORder of Tuples of numbers, strings and optional numbers or strings
  17. Lexicographic ordering of serialized tuples will be the same as of non-serialized
  18. Descending order is supported
  19. */
  20. namespace NPresort {
  21. namespace NImpl {
  22. enum ECode {
  23. StringEnd = 0x00,
  24. StringPart = 0x10,
  25. IntNeg = 0x20,
  26. IntNonNeg = 0x30,
  27. Unsigned = 0x40,
  28. Float = 0x50,
  29. Double = 0x60,
  30. Extension = 0x70,
  31. Descending = 0x80,
  32. };
  33. static const ui8 CodeMask = 0xf0;
  34. static const ui8 LengthMask = 0x0f;
  35. static const ui8 Optional = 0x01;
  36. static const ui8 OptionalFilled = 0x02;
  37. enum EFPCode {
  38. NegInf = 0x00,
  39. NegFar = 0x01,
  40. NegNear = 0x02,
  41. NegSub = 0x03,
  42. Zero = 0x04,
  43. PosSub = 0x05,
  44. PosNear = 0x06,
  45. PosFar = 0x07,
  46. PosInf = 0x08,
  47. Nan = 0x09,
  48. Disabled = 0x0f
  49. };
  50. static const ui8 FPCodeMask = 0x0f;
  51. static const size_t BlockLength = 15;
  52. static const size_t BufferLength = BlockLength + 1;
  53. static const float FloatSignificandBase = pow((float)FLT_RADIX, FLT_MANT_DIG);
  54. static const double DoubleSignificandBase = pow((double)FLT_RADIX, DBL_MANT_DIG);
  55. template <typename T>
  56. struct TSignificandBase {
  57. static double Value() {
  58. return DoubleSignificandBase;
  59. }
  60. };
  61. template <>
  62. struct TSignificandBase<float> {
  63. static float Value() {
  64. return FloatSignificandBase;
  65. }
  66. };
  67. struct TDecodeContext {
  68. ECode Code;
  69. TString Err;
  70. TString Str;
  71. ui32 StrBlocks = 0;
  72. i64 SignedVal = 0;
  73. ui64 UnsignedVal = 0;
  74. float FloatVal = 0;
  75. double DoubleVal = 0;
  76. bool Optional = false;
  77. bool Filled = false;
  78. };
  79. class TBlock {
  80. public:
  81. TBlock()
  82. : Len(0)
  83. {
  84. memset(Buf, 0, BufferLength);
  85. }
  86. void Put(IOutputStream& out) const {
  87. out.Write(Buf, Len);
  88. }
  89. ui8 GetLen() const {
  90. return Len;
  91. }
  92. void EncodeSignedInt(i64 val, bool desc) {
  93. const bool neg = val < 0;
  94. const ui8 bytes = val ? EncodeInt(neg ? -val : val) : 0;
  95. Set(neg ? ((~IntNeg) & CodeMask) : IntNonNeg, bytes, neg != desc);
  96. }
  97. void EncodeUnsignedInt(ui64 val, bool desc, bool end = false) {
  98. const ui8 bytes = val ? EncodeInt(val) : 0;
  99. Set(end ? StringEnd : Unsigned, bytes, desc);
  100. }
  101. bool EncodeFloating(float val, bool desc) {
  102. const EFPCode code = FPCode(val);
  103. Set(Float | code, 0, desc);
  104. return FPNeedEncodeValue(code);
  105. }
  106. bool EncodeFloating(double val, bool desc) {
  107. const EFPCode code = FPCode(val);
  108. Set(Double | code, 0, desc);
  109. return FPNeedEncodeValue(code);
  110. }
  111. void EncodeString(TStringBuf str, bool desc) {
  112. memcpy(Buf + 1, str.data(), str.size());
  113. Set(StringPart, BlockLength, desc);
  114. }
  115. void EncodeOptional(bool filled, bool desc) {
  116. Set(Extension | Optional | (filled ? OptionalFilled : 0), 0, desc);
  117. }
  118. bool Decode(TDecodeContext& ctx, TStringBuf str) {
  119. if (str.empty()) {
  120. ctx.Err = "No data";
  121. return false;
  122. }
  123. Len = 1;
  124. bool desc = false;
  125. ui8 byte = str[0];
  126. ui8 code = byte & CodeMask;
  127. if (code >= Descending) {
  128. desc = true;
  129. byte = ~byte;
  130. code = byte & CodeMask;
  131. }
  132. switch (code) {
  133. case StringPart:
  134. if (!Init(ctx, str, byte, desc)) {
  135. return false;
  136. }
  137. ctx.Str.append((const char*)Buf + 1, Len - 1);
  138. ++ctx.StrBlocks;
  139. break;
  140. case StringEnd: {
  141. if (!Init(ctx, str, byte, desc)) {
  142. return false;
  143. }
  144. const ui64 val = DecodeInt();
  145. if (val) {
  146. if (!ctx.StrBlocks) {
  147. ctx.Err = "Unexpected end of string";
  148. return false;
  149. }
  150. if (val > BlockLength) {
  151. ctx.Err = "Invalid string terminal";
  152. return false;
  153. }
  154. ctx.Str.erase(BlockLength * (ctx.StrBlocks - 1) + val);
  155. }
  156. ctx.StrBlocks = 0;
  157. break;
  158. }
  159. case IntNeg:
  160. if (!Init(ctx, str, ~byte, !desc)) {
  161. return false;
  162. }
  163. ctx.SignedVal = -(i64)DecodeInt();
  164. break;
  165. case IntNonNeg:
  166. if (!Init(ctx, str, byte, desc)) {
  167. return false;
  168. }
  169. ctx.SignedVal = DecodeInt();
  170. break;
  171. case Unsigned:
  172. if (!Init(ctx, str, byte, desc)) {
  173. return false;
  174. }
  175. ctx.UnsignedVal = DecodeInt();
  176. break;
  177. case Float:
  178. if (!DecodeFloating((EFPCode)(byte & FPCodeMask), ctx.FloatVal, ctx, str.Skip(Len))) {
  179. return false;
  180. }
  181. break;
  182. case Double:
  183. if (!DecodeFloating((EFPCode)(byte & FPCodeMask), ctx.DoubleVal, ctx, str.Skip(Len))) {
  184. return false;
  185. }
  186. break;
  187. case Extension:
  188. ctx.Optional = byte & Optional;
  189. ctx.Filled = byte & OptionalFilled;
  190. break;
  191. default:
  192. Y_ABORT("Invalid record code: %d", (int)code);
  193. }
  194. ctx.Code = (ECode)code;
  195. return true;
  196. }
  197. private:
  198. bool Init(TDecodeContext& ctx, TStringBuf str, ui8 byte, bool invert) {
  199. Len = (byte & LengthMask) + 1;
  200. if (Len > BufferLength) {
  201. ctx.Err = "Invalid block length";
  202. return false;
  203. }
  204. if (Len > str.size()) {
  205. ctx.Err = "Unexpected end of data";
  206. return false;
  207. }
  208. memcpy(Buf, str.data(), Len);
  209. if (invert) {
  210. Invert();
  211. }
  212. return true;
  213. }
  214. ui64 DecodeInt() const {
  215. ui64 val = 0;
  216. for (ui8 b = 1; b < Len; ++b) {
  217. const ui8 shift = Len - b - 1;
  218. if (shift < sizeof(ui64)) {
  219. val |= ((ui64)Buf[b]) << (8 * shift);
  220. }
  221. }
  222. return val;
  223. }
  224. ui8 EncodeInt(ui64 val) {
  225. const ui8 bytes = GetValueBitCount(val) / 8 + 1;
  226. for (ui8 b = 1; b <= bytes; ++b) {
  227. const ui8 shift = bytes - b;
  228. if (shift < sizeof(ui64)) {
  229. Buf[b] = val >> (8 * shift);
  230. }
  231. }
  232. return bytes;
  233. }
  234. static bool FPNeedEncodeValue(EFPCode code) {
  235. return code != Nan && code != Zero && code != NegInf && code != PosInf && code != Disabled;
  236. }
  237. template <typename T>
  238. static EFPCode FPCode(T val) {
  239. #ifdef PRESORT_FP_DISABLED
  240. Y_UNUSED(val);
  241. return Disabled;
  242. #else
  243. switch (std::fpclassify(val)) {
  244. case FP_INFINITE:
  245. return val < 0 ? NegInf : PosInf;
  246. case FP_NAN:
  247. return Nan;
  248. case FP_ZERO:
  249. return Zero;
  250. case FP_SUBNORMAL:
  251. return val < 0 ? NegSub : PosSub;
  252. case FP_NORMAL:
  253. break;
  254. }
  255. if (val < 0) {
  256. return val > -1 ? NegNear : NegFar;
  257. }
  258. return val < 1 ? PosNear : PosFar;
  259. #endif
  260. }
  261. template <typename T>
  262. bool DecodeFloating(EFPCode code, T& val, TDecodeContext& ctx, TStringBuf data) {
  263. #ifdef PRESORT_FP_DISABLED
  264. Y_UNUSED(code);
  265. Y_UNUSED(val);
  266. Y_UNUSED(data);
  267. ctx.Err = "Floating point numbers support is disabled";
  268. return false;
  269. #else
  270. switch (code) {
  271. case Zero:
  272. val = 0;
  273. return true;
  274. case NegInf:
  275. val = -std::numeric_limits<T>::infinity();
  276. return true;
  277. case PosInf:
  278. val = std::numeric_limits<T>::infinity();
  279. return true;
  280. case Nan:
  281. val = std::numeric_limits<T>::quiet_NaN();
  282. return true;
  283. case Disabled:
  284. ctx.Err = "Floating point numbers support was disabled on encoding";
  285. return false;
  286. default:
  287. break;
  288. }
  289. i64 exp = 0;
  290. i64 sig = 0;
  291. if (!DecodeFloatingPart(exp, ctx, data) || !DecodeFloatingPart(sig, ctx, data)) {
  292. return false;
  293. }
  294. val = ldexp(sig / TSignificandBase<T>::Value(), exp);
  295. return true;
  296. #endif
  297. }
  298. bool DecodeFloatingPart(i64& val, TDecodeContext& ctx, TStringBuf& data) {
  299. TBlock block;
  300. if (!block.Decode(ctx, data)) {
  301. return false;
  302. }
  303. if (ctx.Code != IntNeg && ctx.Code != IntNonNeg) {
  304. ctx.Err = "Invalid floating part";
  305. return false;
  306. }
  307. val = ctx.SignedVal;
  308. ctx.SignedVal = 0;
  309. data.Skip(block.GetLen());
  310. Len += block.GetLen();
  311. return true;
  312. }
  313. void Set(ui8 code, ui8 size, bool invert) {
  314. Y_ASSERT(size <= BlockLength);
  315. Buf[0] = code | size;
  316. Len = size + 1;
  317. if (invert) {
  318. Invert();
  319. }
  320. }
  321. void Invert() {
  322. Y_ASSERT(Len <= BufferLength);
  323. for (ui8 b = 0; b < Len; ++b) {
  324. Buf[b] = ~Buf[b];
  325. }
  326. }
  327. private:
  328. ui8 Buf[BufferLength];
  329. ui8 Len;
  330. };
  331. }
  332. inline void EncodeSignedInt(IOutputStream& out, i64 val, bool desc = false) {
  333. NImpl::TBlock block;
  334. block.EncodeSignedInt(val, desc);
  335. block.Put(out);
  336. }
  337. inline void EncodeUnsignedInt(IOutputStream& out, ui64 val, bool desc = false) {
  338. NImpl::TBlock block;
  339. block.EncodeUnsignedInt(val, desc);
  340. block.Put(out);
  341. }
  342. template <typename T>
  343. inline void EncodeFloating(IOutputStream& out, T val, bool desc = false) {
  344. NImpl::TBlock head;
  345. const bool encodeValue = head.EncodeFloating(val, desc);
  346. head.Put(out);
  347. if (encodeValue) {
  348. int exponent = 0;
  349. i64 significand = 0;
  350. significand = frexp(val, &exponent) * NImpl::TSignificandBase<T>::Value();
  351. NImpl::TBlock exp;
  352. exp.EncodeSignedInt(exponent, desc);
  353. exp.Put(out);
  354. NImpl::TBlock sig;
  355. sig.EncodeSignedInt(significand, desc);
  356. sig.Put(out);
  357. }
  358. }
  359. inline void EncodeString(IOutputStream& out, TStringBuf str, bool desc = false) {
  360. size_t part = 0;
  361. while (!str.empty()) {
  362. part = Min(str.size(), NImpl::BlockLength);
  363. NImpl::TBlock block;
  364. block.EncodeString(str.Head(part), desc);
  365. block.Put(out);
  366. str.Skip(part);
  367. }
  368. // Encode string end token
  369. NImpl::TBlock end;
  370. end.EncodeUnsignedInt(part, desc, true);
  371. end.Put(out);
  372. }
  373. template <bool Signed>
  374. struct TEncodeInt {
  375. static void Do(IOutputStream& out, i64 val, bool desc) {
  376. EncodeSignedInt(out, val, desc);
  377. }
  378. };
  379. template <>
  380. struct TEncodeInt<false> {
  381. static void Do(IOutputStream& out, ui64 val, bool desc) {
  382. EncodeUnsignedInt(out, val, desc);
  383. }
  384. };
  385. template <typename T, bool Integral>
  386. struct TEncodeNumber {
  387. static void Do(IOutputStream& out, const T& val, bool desc) {
  388. TEncodeInt<std::is_signed<T>::value>::Do(out, val, desc);
  389. }
  390. };
  391. template <typename T>
  392. struct TEncodeNumber<T, false> {
  393. static void Do(IOutputStream& out, const T& val, bool desc) {
  394. EncodeFloating(out, val, desc);
  395. }
  396. };
  397. template <typename T, bool Arithmetic>
  398. struct TEncodeValue {
  399. static void Do(IOutputStream& out, const T& val, bool desc) {
  400. TEncodeNumber<T, std::is_integral<T>::value>::Do(out, val, desc);
  401. }
  402. };
  403. template <typename T>
  404. struct TEncodeValue<T, false> {
  405. static void Do(IOutputStream& out, TStringBuf str, bool desc) {
  406. EncodeString(out, str, desc);
  407. }
  408. };
  409. template <typename T>
  410. static void Encode(IOutputStream& out, const T& val, bool desc = false) {
  411. TEncodeValue<T, std::is_arithmetic<T>::value>::Do(out, val, desc);
  412. }
  413. template <typename T>
  414. inline void EncodeOptional(IOutputStream& out, const TMaybe<T>& val, bool desc = false) {
  415. NImpl::TBlock block;
  416. block.EncodeOptional(val.Defined(), desc);
  417. block.Put(out);
  418. if (val.Defined()) {
  419. Encode(out, *val, desc);
  420. }
  421. }
  422. template <typename T>
  423. static void Encode(IOutputStream& out, const TMaybe<T>& val, bool desc = false) {
  424. EncodeOptional(out, val, desc);
  425. }
  426. struct TResultOps {
  427. void SetError(const TString&) {
  428. return;
  429. }
  430. void SetSignedInt(i64) {
  431. return;
  432. }
  433. void SetUnsignedInt(ui64) {
  434. return;
  435. }
  436. void SetFloat(float) {
  437. return;
  438. }
  439. void SetDouble(double) {
  440. return;
  441. }
  442. void SetString(const TString&) {
  443. return;
  444. }
  445. void SetOptional(bool) {
  446. return;
  447. }
  448. };
  449. template <typename TResult>
  450. bool Decode(TResult& res, TStringBuf data) {
  451. static_assert(std::is_base_of<TResultOps, TResult>::value, "Result must be derived from NPresort::TResultOps");
  452. using namespace NImpl;
  453. TDecodeContext ctx;
  454. while (!data.empty()) {
  455. TBlock block;
  456. if (!block.Decode(ctx, data)) {
  457. res.SetError(ctx.Err);
  458. return false;
  459. }
  460. if (ctx.StrBlocks && ctx.Code != StringPart) {
  461. res.SetError("Unexpected integer");
  462. return false;
  463. }
  464. switch (ctx.Code) {
  465. case StringEnd:
  466. res.SetString(ctx.Str);
  467. ctx.Str = TString();
  468. break;
  469. case IntNeg:
  470. case IntNonNeg:
  471. res.SetSignedInt(ctx.SignedVal);
  472. ctx.SignedVal = 0;
  473. break;
  474. case Unsigned:
  475. res.SetUnsignedInt(ctx.UnsignedVal);
  476. ctx.UnsignedVal = 0;
  477. break;
  478. case Float:
  479. res.SetFloat(ctx.FloatVal);
  480. ctx.FloatVal = 0;
  481. break;
  482. case Double:
  483. res.SetDouble(ctx.DoubleVal);
  484. ctx.DoubleVal = 0;
  485. break;
  486. case Extension:
  487. if (ctx.Optional) {
  488. res.SetOptional(ctx.Filled);
  489. ctx.Optional = false;
  490. ctx.Filled = false;
  491. }
  492. break;
  493. default:
  494. break;
  495. }
  496. data.Skip(block.GetLen());
  497. }
  498. return true;
  499. }
  500. }