Statistics.hh 48 KB


  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. #ifndef ORC_STATISTICS_IMPL_HH
  19. #define ORC_STATISTICS_IMPL_HH
  20. #include "orc/Common.hh"
  21. #include "orc/Int128.hh"
  22. #include "orc/OrcFile.hh"
  23. #include "orc/Reader.hh"
  24. #include "Timezone.hh"
  25. #include "TypeImpl.hh"
  26. namespace orc {
  27. /**
  28. * StatContext contains fields required to compute statistics
  29. */
  30. struct StatContext {
  31. const bool correctStats;
  32. const Timezone* const writerTimezone;
  33. StatContext() : correctStats(false), writerTimezone(nullptr) {}
  34. StatContext(bool cStat, const Timezone* const timezone = nullptr)
  35. : correctStats(cStat), writerTimezone(timezone) {}
  36. };
  /**
   * Internal Statistics Implementation
   *
   * Plain value holder shared by the typed column statistics classes: a
   * value count, a has-null flag, and optional minimum / maximum / sum /
   * total-length fields, each paired with its own "has" flag.  The getters
   * do not consult the flags; callers are expected to check them first.
   *
   * All fields are value-initialised on construction and by reset(), so a
   * getter called before the matching setter yields T() rather than an
   * indeterminate value.
   *
   * Requirements on T: default-constructible and copy-assignable.  An
   * overload of compare(a, b) for T must be visible wherever updateMinMax()
   * or merge() is instantiated; compare() is defined outside this class and
   * is used here as a strict "a orders before b" predicate.
   */
  template <typename T>
  class InternalStatisticsImpl {
   private:
    bool _hasNull = false;
    bool _hasMinimum = false;
    bool _hasMaximum = false;
    bool _hasSum = false;
    bool _hasTotalLength = false;
    uint64_t _totalLength = 0;
    uint64_t _valueCount = 0;
    T _minimum{};
    T _maximum{};
    T _sum{};

   public:
    InternalStatisticsImpl() = default;

    // GET / SET _totalLength
    bool hasTotalLength() const {
      return _hasTotalLength;
    }

    void setHasTotalLength(bool hasTotalLength) {
      _hasTotalLength = hasTotalLength;
    }

    uint64_t getTotalLength() const {
      return _totalLength;
    }

    void setTotalLength(uint64_t totalLength) {
      _totalLength = totalLength;
    }

    // GET / SET _sum
    bool hasSum() const {
      return _hasSum;
    }

    void setHasSum(bool hasSum) {
      _hasSum = hasSum;
    }

    T getSum() const {
      return _sum;
    }

    void setSum(T sum) {
      _sum = sum;
    }

    // GET / SET _maximum
    bool hasMaximum() const {
      return _hasMaximum;
    }

    const T& getMaximum() const {
      return _maximum;
    }

    void setHasMaximum(bool hasMax) {
      _hasMaximum = hasMax;
    }

    void setMaximum(T max) {
      _maximum = max;
    }

    // GET / SET _minimum
    bool hasMinimum() const {
      return _hasMinimum;
    }

    void setHasMinimum(bool hasMin) {
      _hasMinimum = hasMin;
    }

    const T& getMinimum() const {
      return _minimum;
    }

    void setMinimum(T min) {
      _minimum = min;
    }

    // GET / SET _valueCount
    uint64_t getNumberOfValues() const {
      return _valueCount;
    }

    void setNumberOfValues(uint64_t numValues) {
      _valueCount = numValues;
    }

    // GET / SET _hasNull
    bool hasNull() const {
      return _hasNull;
    }

    void setHasNull(bool hasNull) {
      _hasNull = hasNull;
    }

    /**
     * Restores the freshly constructed state: every flag false, both
     * counters zero, and minimum / maximum / sum back to T().
     */
    void reset() {
      *this = InternalStatisticsImpl();
    }

    /**
     * Folds one value into the minimum / maximum.
     *
     * The first value (no minimum recorded yet) becomes both bounds and
     * sets both flags.  Afterwards the value replaces the minimum if it
     * orders before it, otherwise the maximum if the maximum orders before
     * the value.
     */
    void updateMinMax(T value) {
      if (!_hasMinimum) {
        _hasMinimum = _hasMaximum = true;
        _minimum = _maximum = value;
      } else if (compare(value, _minimum)) {
        _minimum = value;
      } else if (compare(_maximum, value)) {
        _maximum = value;
      }
    }

    /**
     * Merges another instance into this one.
     *
     * has-null is OR-ed, counts and total lengths are added, and the total
     * length stays defined only if it was defined on both sides.  Minimum
     * and maximum are widened using compare().  The sum is deliberately not
     * merged here because the owning class has to check for overflow.
     */
    void merge(const InternalStatisticsImpl& other) {
      _hasNull = _hasNull || other._hasNull;
      _valueCount += other._valueCount;

      if (other._hasMinimum) {
        if (!_hasMinimum) {
          // Nothing recorded on this side yet: adopt the other side's bounds.
          _hasMinimum = _hasMaximum = true;
          _minimum = other._minimum;
          _maximum = other._maximum;
        } else {
          // every T used here needs a compare() overload
          if (compare(_maximum, other._maximum)) {
            _maximum = other._maximum;
          }
          if (compare(other._minimum, _minimum)) {
            _minimum = other._minimum;
          }
        }
      }

      _hasTotalLength = _hasTotalLength && other._hasTotalLength;
      _totalLength += other._totalLength;
    }
  };
  172. typedef InternalStatisticsImpl<char> InternalCharStatistics;
  173. typedef InternalStatisticsImpl<char> InternalBooleanStatistics;
  174. typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
  175. typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
  176. typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
  177. typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
  178. typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
  179. typedef InternalStatisticsImpl<uint64_t> InternalCollectionStatistics;
  180. /**
  181. * Mutable column statistics for use by the writer.
  182. */
  183. class MutableColumnStatistics {
  184. public:
  185. virtual ~MutableColumnStatistics();
  // Adds count to the recorded number of values.  (In this header,
  // BooleanColumnStatisticsImpl additionally marks its true-count as
  // defined when this is called.)
  186. virtual void increase(uint64_t count) = 0;
  // Overwrites the recorded number of values.
  187. virtual void setNumberOfValues(uint64_t value) = 0;
  // Records whether the column contains a null.
  188. virtual void setHasNull(bool hasNull) = 0;
  // Folds other into this object.  The implementations in this header
  // dynamic_cast other to their own concrete class, so other must be an
  // instance of the same statistics class as the object it is merged into.
  189. virtual void merge(const MutableColumnStatistics& other) = 0;
  // Returns the object to its initial (empty) state.
  190. virtual void reset() = 0;
  // Writes the current statistics into pbStats.
  191. virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
  192. };
  193. /**
  194. * ColumnStatistics Implementation
  195. */
  196. class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics {
  197. private:
  198. InternalCharStatistics _stats;
  199. public:
  200. ColumnStatisticsImpl() {
  201. reset();
  202. }
  203. ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
  204. virtual ~ColumnStatisticsImpl() override;
  205. uint64_t getNumberOfValues() const override {
  206. return _stats.getNumberOfValues();
  207. }
  208. void setNumberOfValues(uint64_t value) override {
  209. _stats.setNumberOfValues(value);
  210. }
  211. void increase(uint64_t count) override {
  212. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  213. }
  214. bool hasNull() const override {
  215. return _stats.hasNull();
  216. }
  217. void setHasNull(bool hasNull) override {
  218. _stats.setHasNull(hasNull);
  219. }
  220. void merge(const MutableColumnStatistics& other) override {
  221. _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
  222. }
  223. void reset() override {
  224. _stats.reset();
  225. }
  226. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  227. pbStats.set_has_null(_stats.hasNull());
  228. pbStats.set_number_of_values(_stats.getNumberOfValues());
  229. }
  230. std::string toString() const override {
  231. std::ostringstream buffer;
  232. buffer << "Column has " << getNumberOfValues() << " values"
  233. << " and has null value: " << (hasNull() ? "yes" : "no") << std::endl;
  234. return buffer.str();
  235. }
  236. };
  237. class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics {
  238. private:
  239. InternalCharStatistics _stats;
  240. public:
  241. BinaryColumnStatisticsImpl() {
  242. reset();
  243. }
  244. BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
  245. const StatContext& statContext);
  246. virtual ~BinaryColumnStatisticsImpl() override;
  247. uint64_t getNumberOfValues() const override {
  248. return _stats.getNumberOfValues();
  249. }
  250. void setNumberOfValues(uint64_t value) override {
  251. _stats.setNumberOfValues(value);
  252. }
  253. void increase(uint64_t count) override {
  254. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  255. }
  256. bool hasNull() const override {
  257. return _stats.hasNull();
  258. }
  259. void setHasNull(bool hasNull) override {
  260. _stats.setHasNull(hasNull);
  261. }
  262. bool hasTotalLength() const override {
  263. return _stats.hasTotalLength();
  264. }
  265. uint64_t getTotalLength() const override {
  266. if (hasTotalLength()) {
  267. return _stats.getTotalLength();
  268. } else {
  269. throw ParseError("Total length is not defined.");
  270. }
  271. }
  272. void setTotalLength(uint64_t length) {
  273. _stats.setHasTotalLength(true);
  274. _stats.setTotalLength(length);
  275. }
  276. void update(size_t length) {
  277. _stats.setTotalLength(_stats.getTotalLength() + length);
  278. }
  279. void merge(const MutableColumnStatistics& other) override {
  280. const BinaryColumnStatisticsImpl& binStats =
  281. dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
  282. _stats.merge(binStats._stats);
  283. }
  284. void reset() override {
  285. _stats.reset();
  286. setTotalLength(0);
  287. }
  288. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  289. pbStats.set_has_null(_stats.hasNull());
  290. pbStats.set_number_of_values(_stats.getNumberOfValues());
  291. proto::BinaryStatistics* binStats = pbStats.mutable_binary_statistics();
  292. binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
  293. }
  294. std::string toString() const override {
  295. std::ostringstream buffer;
  296. buffer << "Data type: Binary" << std::endl
  297. << "Values: " << getNumberOfValues() << std::endl
  298. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  299. if (hasTotalLength()) {
  300. buffer << "Total length: " << getTotalLength() << std::endl;
  301. } else {
  302. buffer << "Total length: not defined" << std::endl;
  303. }
  304. return buffer.str();
  305. }
  306. };
  307. class BooleanColumnStatisticsImpl : public BooleanColumnStatistics,
  308. public MutableColumnStatistics {
  309. private:
  310. InternalBooleanStatistics _stats;
  311. bool _hasCount;
  312. uint64_t _trueCount;
  313. public:
  314. BooleanColumnStatisticsImpl() {
  315. reset();
  316. }
  317. BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
  318. const StatContext& statContext);
  319. virtual ~BooleanColumnStatisticsImpl() override;
  320. bool hasCount() const override {
  321. return _hasCount;
  322. }
  323. void increase(uint64_t count) override {
  324. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  325. _hasCount = true;
  326. }
  327. uint64_t getNumberOfValues() const override {
  328. return _stats.getNumberOfValues();
  329. }
  330. void setNumberOfValues(uint64_t value) override {
  331. _stats.setNumberOfValues(value);
  332. }
  333. bool hasNull() const override {
  334. return _stats.hasNull();
  335. }
  336. void setHasNull(bool hasNull) override {
  337. _stats.setHasNull(hasNull);
  338. }
  339. uint64_t getFalseCount() const override {
  340. if (hasCount()) {
  341. return getNumberOfValues() - _trueCount;
  342. } else {
  343. throw ParseError("False count is not defined.");
  344. }
  345. }
  346. uint64_t getTrueCount() const override {
  347. if (hasCount()) {
  348. return _trueCount;
  349. } else {
  350. throw ParseError("True count is not defined.");
  351. }
  352. }
  353. void setTrueCount(uint64_t trueCount) {
  354. _hasCount = true;
  355. _trueCount = trueCount;
  356. }
  357. void update(bool value, size_t repetitions) {
  358. if (value) {
  359. _trueCount += repetitions;
  360. }
  361. }
  362. void merge(const MutableColumnStatistics& other) override {
  363. const BooleanColumnStatisticsImpl& boolStats =
  364. dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
  365. _stats.merge(boolStats._stats);
  366. _hasCount = _hasCount && boolStats._hasCount;
  367. _trueCount += boolStats._trueCount;
  368. }
  369. void reset() override {
  370. _stats.reset();
  371. setTrueCount(0);
  372. }
  373. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  374. pbStats.set_has_null(_stats.hasNull());
  375. pbStats.set_number_of_values(_stats.getNumberOfValues());
  376. proto::BucketStatistics* bucketStats = pbStats.mutable_bucket_statistics();
  377. if (_hasCount) {
  378. bucketStats->add_count(_trueCount);
  379. } else {
  380. bucketStats->clear_count();
  381. }
  382. }
  383. std::string toString() const override {
  384. std::ostringstream buffer;
  385. buffer << "Data type: Boolean" << std::endl
  386. << "Values: " << getNumberOfValues() << std::endl
  387. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  388. if (hasCount()) {
  389. buffer << "(true: " << getTrueCount() << "; false: " << getFalseCount() << ")" << std::endl;
  390. } else {
  391. buffer << "(true: not defined; false: not defined)" << std::endl;
  392. buffer << "True and false counts are not defined" << std::endl;
  393. }
  394. return buffer.str();
  395. }
  396. };
  397. class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics {
  398. private:
  399. InternalDateStatistics _stats;
  400. public:
  401. DateColumnStatisticsImpl() {
  402. reset();
  403. }
  404. DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext);
  405. virtual ~DateColumnStatisticsImpl() override;
  406. bool hasMinimum() const override {
  407. return _stats.hasMinimum();
  408. }
  409. bool hasMaximum() const override {
  410. return _stats.hasMaximum();
  411. }
  412. void increase(uint64_t count) override {
  413. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  414. }
  415. uint64_t getNumberOfValues() const override {
  416. return _stats.getNumberOfValues();
  417. }
  418. void setNumberOfValues(uint64_t value) override {
  419. _stats.setNumberOfValues(value);
  420. }
  421. bool hasNull() const override {
  422. return _stats.hasNull();
  423. }
  424. void setHasNull(bool hasNull) override {
  425. _stats.setHasNull(hasNull);
  426. }
  427. int32_t getMinimum() const override {
  428. if (hasMinimum()) {
  429. return _stats.getMinimum();
  430. } else {
  431. throw ParseError("Minimum is not defined.");
  432. }
  433. }
  434. int32_t getMaximum() const override {
  435. if (hasMaximum()) {
  436. return _stats.getMaximum();
  437. } else {
  438. throw ParseError("Maximum is not defined.");
  439. }
  440. }
  441. void setMinimum(int32_t minimum) {
  442. _stats.setHasMinimum(true);
  443. _stats.setMinimum(minimum);
  444. }
  445. void setMaximum(int32_t maximum) {
  446. _stats.setHasMaximum(true);
  447. _stats.setMaximum(maximum);
  448. }
  449. void update(int32_t value) {
  450. _stats.updateMinMax(value);
  451. }
  452. void merge(const MutableColumnStatistics& other) override {
  453. const DateColumnStatisticsImpl& dateStats =
  454. dynamic_cast<const DateColumnStatisticsImpl&>(other);
  455. _stats.merge(dateStats._stats);
  456. }
  457. void reset() override {
  458. _stats.reset();
  459. }
  460. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  461. pbStats.set_has_null(_stats.hasNull());
  462. pbStats.set_number_of_values(_stats.getNumberOfValues());
  463. proto::DateStatistics* dateStatistics = pbStats.mutable_date_statistics();
  464. if (_stats.hasMinimum()) {
  465. dateStatistics->set_maximum(_stats.getMaximum());
  466. dateStatistics->set_minimum(_stats.getMinimum());
  467. } else {
  468. dateStatistics->clear_minimum();
  469. dateStatistics->clear_maximum();
  470. }
  471. }
  472. std::string toString() const override {
  473. std::ostringstream buffer;
  474. buffer << "Data type: Date" << std::endl
  475. << "Values: " << getNumberOfValues() << std::endl
  476. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  477. if (hasMinimum()) {
  478. buffer << "Minimum: " << getMinimum() << std::endl;
  479. } else {
  480. buffer << "Minimum: not defined" << std::endl;
  481. }
  482. if (hasMaximum()) {
  483. buffer << "Maximum: " << getMaximum() << std::endl;
  484. } else {
  485. buffer << "Maximum: not defined" << std::endl;
  486. }
  487. return buffer.str();
  488. }
  489. };
  490. class DecimalColumnStatisticsImpl : public DecimalColumnStatistics,
  491. public MutableColumnStatistics {
  492. private:
  493. InternalDecimalStatistics _stats;
  494. public:
  495. DecimalColumnStatisticsImpl() {
  496. reset();
  497. }
  498. DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
  499. const StatContext& statContext);
  500. virtual ~DecimalColumnStatisticsImpl() override;
  501. bool hasMinimum() const override {
  502. return _stats.hasMinimum();
  503. }
  504. bool hasMaximum() const override {
  505. return _stats.hasMaximum();
  506. }
  507. bool hasSum() const override {
  508. return _stats.hasSum();
  509. }
  510. void increase(uint64_t count) override {
  511. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  512. }
  513. uint64_t getNumberOfValues() const override {
  514. return _stats.getNumberOfValues();
  515. }
  516. void setNumberOfValues(uint64_t value) override {
  517. _stats.setNumberOfValues(value);
  518. }
  519. bool hasNull() const override {
  520. return _stats.hasNull();
  521. }
  522. void setHasNull(bool hasNull) override {
  523. _stats.setHasNull(hasNull);
  524. }
  525. Decimal getMinimum() const override {
  526. if (hasMinimum()) {
  527. return _stats.getMinimum();
  528. } else {
  529. throw ParseError("Minimum is not defined.");
  530. }
  531. }
  532. Decimal getMaximum() const override {
  533. if (hasMaximum()) {
  534. return _stats.getMaximum();
  535. } else {
  536. throw ParseError("Maximum is not defined.");
  537. }
  538. }
  539. void setMinimum(Decimal minimum) {
  540. _stats.setHasMinimum(true);
  541. _stats.setMinimum(minimum);
  542. }
  543. void setMaximum(Decimal maximum) {
  544. _stats.setHasMaximum(true);
  545. _stats.setMaximum(maximum);
  546. }
  547. Decimal getSum() const override {
  548. if (hasSum()) {
  549. return _stats.getSum();
  550. } else {
  551. throw ParseError("Sum is not defined.");
  552. }
  553. }
  554. void setSum(Decimal sum) {
  555. _stats.setHasSum(true);
  556. _stats.setSum(sum);
  557. }
  558. void update(const Decimal& value) {
  559. _stats.updateMinMax(value);
  560. if (_stats.hasSum()) {
  561. updateSum(value);
  562. }
  563. }
  564. void merge(const MutableColumnStatistics& other) override {
  565. const DecimalColumnStatisticsImpl& decStats =
  566. dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
  567. _stats.merge(decStats._stats);
  568. _stats.setHasSum(_stats.hasSum() && decStats.hasSum());
  569. if (_stats.hasSum()) {
  570. updateSum(decStats.getSum());
  571. }
  572. }
  573. void reset() override {
  574. _stats.reset();
  575. setSum(Decimal());
  576. }
  577. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  578. pbStats.set_has_null(_stats.hasNull());
  579. pbStats.set_number_of_values(_stats.getNumberOfValues());
  580. proto::DecimalStatistics* decStats = pbStats.mutable_decimal_statistics();
  581. if (_stats.hasMinimum()) {
  582. decStats->set_minimum(TString(_stats.getMinimum().toString(true)));
  583. decStats->set_maximum(TString(_stats.getMaximum().toString(true)));
  584. } else {
  585. decStats->clear_minimum();
  586. decStats->clear_maximum();
  587. }
  588. if (_stats.hasSum()) {
  589. decStats->set_sum(TString(_stats.getSum().toString(true)));
  590. } else {
  591. decStats->clear_sum();
  592. }
  593. }
  594. std::string toString() const override {
  595. std::ostringstream buffer;
  596. buffer << "Data type: Decimal" << std::endl
  597. << "Values: " << getNumberOfValues() << std::endl
  598. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  599. if (hasMinimum()) {
  600. buffer << "Minimum: " << getMinimum().toString() << std::endl;
  601. } else {
  602. buffer << "Minimum: not defined" << std::endl;
  603. }
  604. if (hasMaximum()) {
  605. buffer << "Maximum: " << getMaximum().toString() << std::endl;
  606. } else {
  607. buffer << "Maximum: not defined" << std::endl;
  608. }
  609. if (hasSum()) {
  610. buffer << "Sum: " << getSum().toString() << std::endl;
  611. } else {
  612. buffer << "Sum: not defined" << std::endl;
  613. }
  614. return buffer.str();
  615. }
  616. private:
  617. void updateSum(Decimal value) {
  618. if (_stats.hasSum()) {
  619. bool overflow = false;
  620. Decimal sum = _stats.getSum();
  621. if (sum.scale > value.scale) {
  622. value.value = scaleUpInt128ByPowerOfTen(value.value, sum.scale - value.scale, overflow);
  623. } else if (sum.scale < value.scale) {
  624. sum.value = scaleUpInt128ByPowerOfTen(sum.value, value.scale - sum.scale, overflow);
  625. sum.scale = value.scale;
  626. }
  627. if (!overflow) {
  628. bool wasPositive = sum.value >= 0;
  629. sum.value += value.value;
  630. if ((value.value >= 0) == wasPositive) {
  631. _stats.setHasSum((sum.value >= 0) == wasPositive);
  632. }
  633. } else {
  634. _stats.setHasSum(false);
  635. }
  636. if (_stats.hasSum()) {
  637. _stats.setSum(sum);
  638. }
  639. }
  640. }
  641. };
  642. class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics {
  643. private:
  644. InternalDoubleStatistics _stats;
  645. public:
  646. DoubleColumnStatisticsImpl() {
  647. reset();
  648. }
  649. DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
  650. virtual ~DoubleColumnStatisticsImpl() override;
  651. bool hasMinimum() const override {
  652. return _stats.hasMinimum();
  653. }
  654. bool hasMaximum() const override {
  655. return _stats.hasMaximum();
  656. }
  657. bool hasSum() const override {
  658. return _stats.hasSum();
  659. }
  660. void increase(uint64_t count) override {
  661. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  662. }
  663. uint64_t getNumberOfValues() const override {
  664. return _stats.getNumberOfValues();
  665. }
  666. void setNumberOfValues(uint64_t value) override {
  667. _stats.setNumberOfValues(value);
  668. }
  669. bool hasNull() const override {
  670. return _stats.hasNull();
  671. }
  672. void setHasNull(bool hasNull) override {
  673. _stats.setHasNull(hasNull);
  674. }
  675. double getMinimum() const override {
  676. if (hasMinimum()) {
  677. return _stats.getMinimum();
  678. } else {
  679. throw ParseError("Minimum is not defined.");
  680. }
  681. }
  682. double getMaximum() const override {
  683. if (hasMaximum()) {
  684. return _stats.getMaximum();
  685. } else {
  686. throw ParseError("Maximum is not defined.");
  687. }
  688. }
  689. void setMinimum(double minimum) {
  690. _stats.setHasMinimum(true);
  691. _stats.setMinimum(minimum);
  692. }
  693. void setMaximum(double maximum) {
  694. _stats.setHasMaximum(true);
  695. _stats.setMaximum(maximum);
  696. }
  697. double getSum() const override {
  698. if (hasSum()) {
  699. return _stats.getSum();
  700. } else {
  701. throw ParseError("Sum is not defined.");
  702. }
  703. }
  704. void setSum(double sum) {
  705. _stats.setHasSum(true);
  706. _stats.setSum(sum);
  707. }
  708. void update(double value) {
  709. _stats.updateMinMax(value);
  710. _stats.setSum(_stats.getSum() + value);
  711. }
  712. void merge(const MutableColumnStatistics& other) override {
  713. const DoubleColumnStatisticsImpl& doubleStats =
  714. dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
  715. _stats.merge(doubleStats._stats);
  716. _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
  717. if (_stats.hasSum()) {
  718. _stats.setSum(_stats.getSum() + doubleStats.getSum());
  719. }
  720. }
  721. void reset() override {
  722. _stats.reset();
  723. setSum(0.0);
  724. }
  725. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  726. pbStats.set_has_null(_stats.hasNull());
  727. pbStats.set_number_of_values(_stats.getNumberOfValues());
  728. proto::DoubleStatistics* doubleStats = pbStats.mutable_double_statistics();
  729. if (_stats.hasMinimum()) {
  730. doubleStats->set_minimum(_stats.getMinimum());
  731. doubleStats->set_maximum(_stats.getMaximum());
  732. } else {
  733. doubleStats->clear_minimum();
  734. doubleStats->clear_maximum();
  735. }
  736. if (_stats.hasSum()) {
  737. doubleStats->set_sum(_stats.getSum());
  738. } else {
  739. doubleStats->clear_sum();
  740. }
  741. }
  742. std::string toString() const override {
  743. std::ostringstream buffer;
  744. buffer << "Data type: Double" << std::endl
  745. << "Values: " << getNumberOfValues() << std::endl
  746. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  747. if (hasMinimum()) {
  748. buffer << "Minimum: " << getMinimum() << std::endl;
  749. } else {
  750. buffer << "Minimum: not defined" << std::endl;
  751. }
  752. if (hasMaximum()) {
  753. buffer << "Maximum: " << getMaximum() << std::endl;
  754. } else {
  755. buffer << "Maximum: not defined" << std::endl;
  756. }
  757. if (hasSum()) {
  758. buffer << "Sum: " << getSum() << std::endl;
  759. } else {
  760. buffer << "Sum: not defined" << std::endl;
  761. }
  762. return buffer.str();
  763. }
  764. };
  765. class IntegerColumnStatisticsImpl : public IntegerColumnStatistics,
  766. public MutableColumnStatistics {
  767. private:
  768. InternalIntegerStatistics _stats;
  769. public:
  770. IntegerColumnStatisticsImpl() {
  771. reset();
  772. }
  773. IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
  774. virtual ~IntegerColumnStatisticsImpl() override;
  775. bool hasMinimum() const override {
  776. return _stats.hasMinimum();
  777. }
  778. bool hasMaximum() const override {
  779. return _stats.hasMaximum();
  780. }
  781. bool hasSum() const override {
  782. return _stats.hasSum();
  783. }
  784. void increase(uint64_t count) override {
  785. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  786. }
  787. uint64_t getNumberOfValues() const override {
  788. return _stats.getNumberOfValues();
  789. }
  790. void setNumberOfValues(uint64_t value) override {
  791. _stats.setNumberOfValues(value);
  792. }
  793. bool hasNull() const override {
  794. return _stats.hasNull();
  795. }
  796. void setHasNull(bool hasNull) override {
  797. _stats.setHasNull(hasNull);
  798. }
  799. int64_t getMinimum() const override {
  800. if (hasMinimum()) {
  801. return _stats.getMinimum();
  802. } else {
  803. throw ParseError("Minimum is not defined.");
  804. }
  805. }
  806. int64_t getMaximum() const override {
  807. if (hasMaximum()) {
  808. return _stats.getMaximum();
  809. } else {
  810. throw ParseError("Maximum is not defined.");
  811. }
  812. }
  813. void setMinimum(int64_t minimum) {
  814. _stats.setHasMinimum(true);
  815. _stats.setMinimum(minimum);
  816. }
  817. void setMaximum(int64_t maximum) {
  818. _stats.setHasMaximum(true);
  819. _stats.setMaximum(maximum);
  820. }
  821. int64_t getSum() const override {
  822. if (hasSum()) {
  823. return _stats.getSum();
  824. } else {
  825. throw ParseError("Sum is not defined.");
  826. }
  827. }
  828. void setSum(int64_t sum) {
  829. _stats.setHasSum(true);
  830. _stats.setSum(sum);
  831. }
  832. void update(int64_t value, int repetitions) {
  833. _stats.updateMinMax(value);
  834. if (_stats.hasSum()) {
  835. if (repetitions > 1) {
  836. _stats.setHasSum(multiplyExact(value, repetitions, &value));
  837. }
  838. if (_stats.hasSum()) {
  839. _stats.setHasSum(addExact(_stats.getSum(), value, &value));
  840. if (_stats.hasSum()) {
  841. _stats.setSum(value);
  842. }
  843. }
  844. }
  845. }
  846. void merge(const MutableColumnStatistics& other) override {
  847. const IntegerColumnStatisticsImpl& intStats =
  848. dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
  849. _stats.merge(intStats._stats);
  850. // update sum and check overflow
  851. _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
  852. if (_stats.hasSum()) {
  853. int64_t value;
  854. _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value));
  855. if (_stats.hasSum()) {
  856. _stats.setSum(value);
  857. }
  858. }
  859. }
  860. void reset() override {
  861. _stats.reset();
  862. setSum(0);
  863. }
  864. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  865. pbStats.set_has_null(_stats.hasNull());
  866. pbStats.set_number_of_values(_stats.getNumberOfValues());
  867. proto::IntegerStatistics* intStats = pbStats.mutable_int_statistics();
  868. if (_stats.hasMinimum()) {
  869. intStats->set_minimum(_stats.getMinimum());
  870. intStats->set_maximum(_stats.getMaximum());
  871. } else {
  872. intStats->clear_minimum();
  873. intStats->clear_maximum();
  874. }
  875. if (_stats.hasSum()) {
  876. intStats->set_sum(_stats.getSum());
  877. } else {
  878. intStats->clear_sum();
  879. }
  880. }
  881. std::string toString() const override {
  882. std::ostringstream buffer;
  883. buffer << "Data type: Integer" << std::endl
  884. << "Values: " << getNumberOfValues() << std::endl
  885. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  886. if (hasMinimum()) {
  887. buffer << "Minimum: " << getMinimum() << std::endl;
  888. } else {
  889. buffer << "Minimum: not defined" << std::endl;
  890. }
  891. if (hasMaximum()) {
  892. buffer << "Maximum: " << getMaximum() << std::endl;
  893. } else {
  894. buffer << "Maximum: not defined" << std::endl;
  895. }
  896. if (hasSum()) {
  897. buffer << "Sum: " << getSum() << std::endl;
  898. } else {
  899. buffer << "Sum: not defined" << std::endl;
  900. }
  901. return buffer.str();
  902. }
  903. };
  904. class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics {
  905. private:
  906. InternalStringStatistics _stats;
  907. public:
  908. StringColumnStatisticsImpl() {
  909. reset();
  910. }
  911. StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
  912. const StatContext& statContext);
  913. virtual ~StringColumnStatisticsImpl() override;
  914. bool hasMinimum() const override {
  915. return _stats.hasMinimum();
  916. }
  917. bool hasMaximum() const override {
  918. return _stats.hasMaximum();
  919. }
  920. bool hasTotalLength() const override {
  921. return _stats.hasTotalLength();
  922. }
  923. void increase(uint64_t count) override {
  924. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  925. }
  926. uint64_t getNumberOfValues() const override {
  927. return _stats.getNumberOfValues();
  928. }
  929. void setNumberOfValues(uint64_t value) override {
  930. _stats.setNumberOfValues(value);
  931. }
  932. bool hasNull() const override {
  933. return _stats.hasNull();
  934. }
  935. void setHasNull(bool hasNull) override {
  936. _stats.setHasNull(hasNull);
  937. }
  938. const std::string& getMinimum() const override {
  939. if (hasMinimum()) {
  940. return _stats.getMinimum();
  941. } else {
  942. throw ParseError("Minimum is not defined.");
  943. }
  944. }
  945. const std::string& getMaximum() const override {
  946. if (hasMaximum()) {
  947. return _stats.getMaximum();
  948. } else {
  949. throw ParseError("Maximum is not defined.");
  950. }
  951. }
  952. void setMinimum(std::string minimum) {
  953. _stats.setHasMinimum(true);
  954. _stats.setMinimum(minimum);
  955. }
  956. void setMaximum(std::string maximum) {
  957. _stats.setHasMaximum(true);
  958. _stats.setMaximum(maximum);
  959. }
  960. uint64_t getTotalLength() const override {
  961. if (hasTotalLength()) {
  962. return _stats.getTotalLength();
  963. } else {
  964. throw ParseError("Total length is not defined.");
  965. }
  966. }
  967. void setTotalLength(uint64_t length) {
  968. _stats.setHasTotalLength(true);
  969. _stats.setTotalLength(length);
  970. }
  971. void update(const char* value, size_t length) {
  972. if (value != nullptr) {
  973. if (!_stats.hasMinimum()) {
  974. std::string tempStr(value, value + length);
  975. setMinimum(tempStr);
  976. setMaximum(tempStr);
  977. } else {
  978. // update min
  979. int minCmp = strncmp(_stats.getMinimum().c_str(), value,
  980. std::min(_stats.getMinimum().length(), length));
  981. if (minCmp > 0 || (minCmp == 0 && length < _stats.getMinimum().length())) {
  982. setMinimum(std::string(value, value + length));
  983. }
  984. // update max
  985. int maxCmp = strncmp(_stats.getMaximum().c_str(), value,
  986. std::min(_stats.getMaximum().length(), length));
  987. if (maxCmp < 0 || (maxCmp == 0 && length > _stats.getMaximum().length())) {
  988. setMaximum(std::string(value, value + length));
  989. }
  990. }
  991. }
  992. _stats.setTotalLength(_stats.getTotalLength() + length);
  993. }
  994. void update(std::string value) {
  995. update(value.c_str(), value.length());
  996. }
  997. void merge(const MutableColumnStatistics& other) override {
  998. const StringColumnStatisticsImpl& strStats =
  999. dynamic_cast<const StringColumnStatisticsImpl&>(other);
  1000. _stats.merge(strStats._stats);
  1001. }
  1002. void reset() override {
  1003. _stats.reset();
  1004. setTotalLength(0);
  1005. }
  1006. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  1007. pbStats.set_has_null(_stats.hasNull());
  1008. pbStats.set_number_of_values(_stats.getNumberOfValues());
  1009. proto::StringStatistics* strStats = pbStats.mutable_string_statistics();
  1010. if (_stats.hasMinimum()) {
  1011. strStats->set_minimum(TString(_stats.getMinimum()));
  1012. strStats->set_maximum(TString(_stats.getMaximum()));
  1013. } else {
  1014. strStats->clear_minimum();
  1015. strStats->clear_maximum();
  1016. }
  1017. if (_stats.hasTotalLength()) {
  1018. strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
  1019. } else {
  1020. strStats->clear_sum();
  1021. }
  1022. }
  1023. std::string toString() const override {
  1024. std::ostringstream buffer;
  1025. buffer << "Data type: String" << std::endl
  1026. << "Values: " << getNumberOfValues() << std::endl
  1027. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  1028. if (hasMinimum()) {
  1029. buffer << "Minimum: " << getMinimum() << std::endl;
  1030. } else {
  1031. buffer << "Minimum is not defined" << std::endl;
  1032. }
  1033. if (hasMaximum()) {
  1034. buffer << "Maximum: " << getMaximum() << std::endl;
  1035. } else {
  1036. buffer << "Maximum is not defined" << std::endl;
  1037. }
  1038. if (hasTotalLength()) {
  1039. buffer << "Total length: " << getTotalLength() << std::endl;
  1040. } else {
  1041. buffer << "Total length is not defined" << std::endl;
  1042. }
  1043. return buffer.str();
  1044. }
  1045. };
  1046. class TimestampColumnStatisticsImpl : public TimestampColumnStatistics,
  1047. public MutableColumnStatistics {
  1048. private:
  1049. InternalIntegerStatistics _stats;
  1050. bool _hasLowerBound;
  1051. bool _hasUpperBound;
  1052. int64_t _lowerBound;
  1053. int64_t _upperBound;
  1054. int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp
  1055. int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp
  1056. static constexpr int32_t DEFAULT_MIN_NANOS = 0;
  1057. static constexpr int32_t DEFAULT_MAX_NANOS = 999999;
  1058. public:
  1059. TimestampColumnStatisticsImpl() {
  1060. reset();
  1061. }
  1062. TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
  1063. const StatContext& statContext);
  1064. virtual ~TimestampColumnStatisticsImpl() override;
  1065. bool hasMinimum() const override {
  1066. return _stats.hasMinimum();
  1067. }
  1068. bool hasMaximum() const override {
  1069. return _stats.hasMaximum();
  1070. }
  1071. uint64_t getNumberOfValues() const override {
  1072. return _stats.getNumberOfValues();
  1073. }
  1074. void setNumberOfValues(uint64_t value) override {
  1075. _stats.setNumberOfValues(value);
  1076. }
  1077. void increase(uint64_t count) override {
  1078. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  1079. }
  1080. bool hasNull() const override {
  1081. return _stats.hasNull();
  1082. }
  1083. void setHasNull(bool hasNull) override {
  1084. _stats.setHasNull(hasNull);
  1085. }
  1086. int64_t getMinimum() const override {
  1087. if (hasMinimum()) {
  1088. return _stats.getMinimum();
  1089. } else {
  1090. throw ParseError("Minimum is not defined.");
  1091. }
  1092. }
  1093. int64_t getMaximum() const override {
  1094. if (hasMaximum()) {
  1095. return _stats.getMaximum();
  1096. } else {
  1097. throw ParseError("Maximum is not defined.");
  1098. }
  1099. }
  1100. void setMinimum(int64_t minimum) {
  1101. _stats.setHasMinimum(true);
  1102. _stats.setMinimum(minimum);
  1103. }
  1104. void setMaximum(int64_t maximum) {
  1105. _stats.setHasMaximum(true);
  1106. _stats.setMaximum(maximum);
  1107. }
  1108. void update(int64_t value) {
  1109. _stats.updateMinMax(value);
  1110. }
  1111. void update(int64_t milli, int32_t nano) {
  1112. if (!_stats.hasMinimum()) {
  1113. _stats.setHasMinimum(true);
  1114. _stats.setHasMaximum(true);
  1115. _stats.setMinimum(milli);
  1116. _stats.setMaximum(milli);
  1117. _maximumNanos = _minimumNanos = nano;
  1118. } else {
  1119. if (milli <= _stats.getMinimum()) {
  1120. if (milli < _stats.getMinimum() || nano < _minimumNanos) {
  1121. _minimumNanos = nano;
  1122. }
  1123. _stats.setMinimum(milli);
  1124. }
  1125. if (milli >= _stats.getMaximum()) {
  1126. if (milli > _stats.getMaximum() || nano > _maximumNanos) {
  1127. _maximumNanos = nano;
  1128. }
  1129. _stats.setMaximum(milli);
  1130. }
  1131. }
  1132. }
  1133. void merge(const MutableColumnStatistics& other) override {
  1134. const TimestampColumnStatisticsImpl& tsStats =
  1135. dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
  1136. _stats.setHasNull(_stats.hasNull() || tsStats.hasNull());
  1137. _stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues());
  1138. if (tsStats.hasMinimum()) {
  1139. if (!_stats.hasMinimum()) {
  1140. _stats.setHasMinimum(true);
  1141. _stats.setHasMaximum(true);
  1142. _stats.setMinimum(tsStats.getMinimum());
  1143. _stats.setMaximum(tsStats.getMaximum());
  1144. _minimumNanos = tsStats.getMinimumNanos();
  1145. _maximumNanos = tsStats.getMaximumNanos();
  1146. } else {
  1147. if (tsStats.getMaximum() >= _stats.getMaximum()) {
  1148. if (tsStats.getMaximum() > _stats.getMaximum() ||
  1149. tsStats.getMaximumNanos() > _maximumNanos) {
  1150. _maximumNanos = tsStats.getMaximumNanos();
  1151. }
  1152. _stats.setMaximum(tsStats.getMaximum());
  1153. }
  1154. if (tsStats.getMinimum() <= _stats.getMinimum()) {
  1155. if (tsStats.getMinimum() < _stats.getMinimum() ||
  1156. tsStats.getMinimumNanos() < _minimumNanos) {
  1157. _minimumNanos = tsStats.getMinimumNanos();
  1158. }
  1159. _stats.setMinimum(tsStats.getMinimum());
  1160. }
  1161. }
  1162. }
  1163. }
  1164. void reset() override {
  1165. _stats.reset();
  1166. _minimumNanos = DEFAULT_MIN_NANOS;
  1167. _maximumNanos = DEFAULT_MAX_NANOS;
  1168. }
  1169. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  1170. pbStats.set_has_null(_stats.hasNull());
  1171. pbStats.set_number_of_values(_stats.getNumberOfValues());
  1172. proto::TimestampStatistics* tsStats = pbStats.mutable_timestamp_statistics();
  1173. if (_stats.hasMinimum()) {
  1174. tsStats->set_minimum_utc(_stats.getMinimum());
  1175. tsStats->set_maximum_utc(_stats.getMaximum());
  1176. if (_minimumNanos != DEFAULT_MIN_NANOS) {
  1177. tsStats->set_minimum_nanos(_minimumNanos + 1);
  1178. }
  1179. if (_maximumNanos != DEFAULT_MAX_NANOS) {
  1180. tsStats->set_maximum_nanos(_maximumNanos + 1);
  1181. }
  1182. } else {
  1183. tsStats->clear_minimum_utc();
  1184. tsStats->clear_maximum_utc();
  1185. tsStats->clear_minimum_nanos();
  1186. tsStats->clear_maximum_nanos();
  1187. }
  1188. }
  1189. std::string toString() const override {
  1190. std::ostringstream buffer;
  1191. struct tm tmValue;
  1192. char timeBuffer[20];
  1193. time_t secs = 0;
  1194. buffer << "Data type: Timestamp" << std::endl
  1195. << "Values: " << getNumberOfValues() << std::endl
  1196. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  1197. if (hasMinimum()) {
  1198. secs = static_cast<time_t>(getMinimum() / 1000);
  1199. gmtime_r(&secs, &tmValue);
  1200. strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
  1201. buffer << "Minimum: " << timeBuffer << "." << (getMinimum() % 1000) << std::endl;
  1202. } else {
  1203. buffer << "Minimum is not defined" << std::endl;
  1204. }
  1205. if (hasLowerBound()) {
  1206. secs = static_cast<time_t>(getLowerBound() / 1000);
  1207. gmtime_r(&secs, &tmValue);
  1208. strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
  1209. buffer << "LowerBound: " << timeBuffer << "." << (getLowerBound() % 1000) << std::endl;
  1210. } else {
  1211. buffer << "LowerBound is not defined" << std::endl;
  1212. }
  1213. if (hasMaximum()) {
  1214. secs = static_cast<time_t>(getMaximum() / 1000);
  1215. gmtime_r(&secs, &tmValue);
  1216. strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
  1217. buffer << "Maximum: " << timeBuffer << "." << (getMaximum() % 1000) << std::endl;
  1218. } else {
  1219. buffer << "Maximum is not defined" << std::endl;
  1220. }
  1221. if (hasUpperBound()) {
  1222. secs = static_cast<time_t>(getUpperBound() / 1000);
  1223. gmtime_r(&secs, &tmValue);
  1224. strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
  1225. buffer << "UpperBound: " << timeBuffer << "." << (getUpperBound() % 1000) << std::endl;
  1226. } else {
  1227. buffer << "UpperBound is not defined" << std::endl;
  1228. }
  1229. return buffer.str();
  1230. }
  1231. bool hasLowerBound() const override {
  1232. return _hasLowerBound;
  1233. }
  1234. bool hasUpperBound() const override {
  1235. return _hasUpperBound;
  1236. }
  1237. int64_t getLowerBound() const override {
  1238. if (hasLowerBound()) {
  1239. return _lowerBound;
  1240. } else {
  1241. throw ParseError("LowerBound is not defined.");
  1242. }
  1243. }
  1244. int64_t getUpperBound() const override {
  1245. if (hasUpperBound()) {
  1246. return _upperBound;
  1247. } else {
  1248. throw ParseError("UpperBound is not defined.");
  1249. }
  1250. }
  1251. int32_t getMinimumNanos() const override {
  1252. if (hasMinimum()) {
  1253. return _minimumNanos;
  1254. } else {
  1255. throw ParseError("Minimum is not defined.");
  1256. }
  1257. }
  1258. int32_t getMaximumNanos() const override {
  1259. if (hasMaximum()) {
  1260. return _maximumNanos;
  1261. } else {
  1262. throw ParseError("Maximum is not defined.");
  1263. }
  1264. }
  1265. };
  1266. class CollectionColumnStatisticsImpl : public CollectionColumnStatistics,
  1267. public MutableColumnStatistics {
  1268. private:
  1269. InternalCollectionStatistics _stats;
  1270. public:
  1271. CollectionColumnStatisticsImpl() {
  1272. reset();
  1273. }
  1274. CollectionColumnStatisticsImpl(const proto::ColumnStatistics& stats);
  1275. virtual ~CollectionColumnStatisticsImpl() override;
  1276. bool hasMinimumChildren() const override {
  1277. return _stats.hasMinimum();
  1278. }
  1279. bool hasMaximumChildren() const override {
  1280. return _stats.hasMaximum();
  1281. }
  1282. bool hasTotalChildren() const override {
  1283. return _stats.hasSum();
  1284. }
  1285. void increase(uint64_t count) override {
  1286. _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
  1287. }
  1288. uint64_t getNumberOfValues() const override {
  1289. return _stats.getNumberOfValues();
  1290. }
  1291. void setNumberOfValues(uint64_t value) override {
  1292. _stats.setNumberOfValues(value);
  1293. }
  1294. bool hasNull() const override {
  1295. return _stats.hasNull();
  1296. }
  1297. void setHasNull(bool hasNull) override {
  1298. _stats.setHasNull(hasNull);
  1299. }
  1300. uint64_t getMinimumChildren() const override {
  1301. if (hasMinimumChildren()) {
  1302. return _stats.getMinimum();
  1303. } else {
  1304. throw ParseError("MinimumChildren is not defined.");
  1305. }
  1306. }
  1307. uint64_t getMaximumChildren() const override {
  1308. if (hasMaximumChildren()) {
  1309. return _stats.getMaximum();
  1310. } else {
  1311. throw ParseError("MaximumChildren is not defined.");
  1312. }
  1313. }
  1314. uint64_t getTotalChildren() const override {
  1315. if (hasTotalChildren()) {
  1316. return _stats.getSum();
  1317. } else {
  1318. throw ParseError("TotalChildren is not defined.");
  1319. }
  1320. }
  1321. void setMinimumChildren(uint64_t minimum) override {
  1322. _stats.setHasMinimum(true);
  1323. _stats.setMinimum(minimum);
  1324. }
  1325. void setMaximumChildren(uint64_t maximum) override {
  1326. _stats.setHasMaximum(true);
  1327. _stats.setMaximum(maximum);
  1328. }
  1329. void setTotalChildren(uint64_t sum) override {
  1330. _stats.setHasSum(true);
  1331. _stats.setSum(sum);
  1332. }
  1333. void setHasTotalChildren(bool hasSum) override {
  1334. _stats.setHasSum(hasSum);
  1335. }
  1336. void merge(const MutableColumnStatistics& other) override {
  1337. const CollectionColumnStatisticsImpl& collectionStats =
  1338. dynamic_cast<const CollectionColumnStatisticsImpl&>(other);
  1339. _stats.merge(collectionStats._stats);
  1340. // hasSumValue here means no overflow
  1341. _stats.setHasSum(_stats.hasSum() && collectionStats.hasTotalChildren());
  1342. if (_stats.hasSum()) {
  1343. uint64_t oldSum = _stats.getSum();
  1344. _stats.setSum(_stats.getSum() + collectionStats.getTotalChildren());
  1345. if (oldSum > _stats.getSum()) {
  1346. _stats.setHasSum(false);
  1347. }
  1348. }
  1349. }
  1350. void reset() override {
  1351. _stats.reset();
  1352. setTotalChildren(0);
  1353. }
  1354. void update(uint64_t value) {
  1355. _stats.updateMinMax(value);
  1356. if (_stats.hasSum()) {
  1357. uint64_t oldSum = _stats.getSum();
  1358. _stats.setSum(_stats.getSum() + value);
  1359. if (oldSum > _stats.getSum()) {
  1360. _stats.setHasSum(false);
  1361. }
  1362. }
  1363. }
  1364. void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
  1365. pbStats.set_has_null(_stats.hasNull());
  1366. pbStats.set_number_of_values(_stats.getNumberOfValues());
  1367. proto::CollectionStatistics* collectionStats = pbStats.mutable_collection_statistics();
  1368. if (_stats.hasMinimum()) {
  1369. collectionStats->set_min_children(_stats.getMinimum());
  1370. collectionStats->set_max_children(_stats.getMaximum());
  1371. } else {
  1372. collectionStats->clear_min_children();
  1373. collectionStats->clear_max_children();
  1374. }
  1375. if (_stats.hasSum()) {
  1376. collectionStats->set_total_children(_stats.getSum());
  1377. } else {
  1378. collectionStats->clear_total_children();
  1379. }
  1380. }
  1381. std::string toString() const override {
  1382. std::ostringstream buffer;
  1383. buffer << "Data type: Collection(LIST|MAP)" << std::endl
  1384. << "Values: " << getNumberOfValues() << std::endl
  1385. << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
  1386. if (hasMinimumChildren()) {
  1387. buffer << "MinChildren: " << getMinimumChildren() << std::endl;
  1388. } else {
  1389. buffer << "MinChildren is not defined" << std::endl;
  1390. }
  1391. if (hasMaximumChildren()) {
  1392. buffer << "MaxChildren: " << getMaximumChildren() << std::endl;
  1393. } else {
  1394. buffer << "MaxChildren is not defined" << std::endl;
  1395. }
  1396. if (hasTotalChildren()) {
  1397. buffer << "TotalChildren: " << getTotalChildren() << std::endl;
  1398. } else {
  1399. buffer << "TotalChildren is not defined" << std::endl;
  1400. }
  1401. return buffer.str();
  1402. }
  1403. };
  1404. ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
  1405. const StatContext& statContext);
  1406. class StatisticsImpl : public Statistics {
  1407. private:
  1408. std::vector<ColumnStatistics*> colStats;
  1409. // DELIBERATELY NOT IMPLEMENTED
  1410. StatisticsImpl(const StatisticsImpl&);
  1411. StatisticsImpl& operator=(const StatisticsImpl&);
  1412. public:
  1413. StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext);
  1414. StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);
  1415. virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
  1416. return colStats[columnId];
  1417. }
  1418. virtual ~StatisticsImpl() override;
  1419. uint32_t getNumberOfColumns() const override {
  1420. return static_cast<uint32_t>(colStats.size());
  1421. }
  1422. };
  1423. class StripeStatisticsImpl : public StripeStatistics {
  1424. private:
  1425. std::unique_ptr<StatisticsImpl> columnStats;
  1426. std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats;
  1427. // DELIBERATELY NOT IMPLEMENTED
  1428. StripeStatisticsImpl(const StripeStatisticsImpl&);
  1429. StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
  1430. public:
  1431. StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
  1432. std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
  1433. const StatContext& statContext);
  1434. virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
  1435. return columnStats->getColumnStatistics(columnId);
  1436. }
  1437. uint32_t getNumberOfColumns() const override {
  1438. return columnStats->getNumberOfColumns();
  1439. }
  1440. virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
  1441. uint32_t rowIndex) const override {
  1442. // check id indices are valid
  1443. return rowIndexStats[columnId][rowIndex].get();
  1444. }
  1445. virtual ~StripeStatisticsImpl() override;
  1446. uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
  1447. return static_cast<uint32_t>(rowIndexStats[columnId].size());
  1448. }
  1449. };
  1450. /**
  1451. * Create ColumnStatistics for writers
  1452. * @param type of column
  1453. * @return MutableColumnStatistics instances
  1454. */
  1455. std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type);
  1456. } // namespace orc
  1457. #endif