/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef ORC_STATISTICS_IMPL_HH #define ORC_STATISTICS_IMPL_HH #include "orc/Common.hh" #include "orc/Int128.hh" #include "orc/OrcFile.hh" #include "orc/Reader.hh" #include "Timezone.hh" #include "TypeImpl.hh" namespace orc { /** * StatContext contains fields required to compute statistics */ struct StatContext { const bool correctStats; const Timezone* const writerTimezone; StatContext() : correctStats(false), writerTimezone(nullptr) {} StatContext(bool cStat, const Timezone* const timezone = nullptr) : correctStats(cStat), writerTimezone(timezone) {} }; /** * Internal Statistics Implementation */ template class InternalStatisticsImpl { private: bool _hasNull; bool _hasMinimum; bool _hasMaximum; bool _hasSum; bool _hasTotalLength; uint64_t _totalLength; uint64_t _valueCount; T _minimum; T _maximum; T _sum; public: InternalStatisticsImpl() { _hasNull = false; _hasMinimum = false; _hasMaximum = false; _hasSum = false; _hasTotalLength = false; _totalLength = 0; _valueCount = 0; } ~InternalStatisticsImpl() {} // GET / SET _totalLength bool hasTotalLength() const { return _hasTotalLength; } void setHasTotalLength(bool hasTotalLength) { _hasTotalLength = hasTotalLength; } uint64_t getTotalLength() const { return _totalLength; } void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; } // GET / SET _sum bool hasSum() const { return _hasSum; } void setHasSum(bool hasSum) { _hasSum = hasSum; } T getSum() const { return _sum; } void setSum(T sum) { _sum = sum; } // GET / SET _maximum bool hasMaximum() const { return _hasMaximum; } const T& getMaximum() const { return _maximum; } void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; } void setMaximum(T max) { _maximum = max; } // GET / SET _minimum bool hasMinimum() const { return _hasMinimum; } void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; } const T& getMinimum() const { return _minimum; } void setMinimum(T min) { _minimum = min; } // GET / SET _valueCount uint64_t getNumberOfValues() const { return _valueCount; } void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; } // GET / SET _hasNullValue bool hasNull() const { return _hasNull; } void setHasNull(bool hasNull) { _hasNull = hasNull; } void reset() { _hasNull = false; _hasMinimum = false; _hasMaximum = false; _hasSum = false; _hasTotalLength = false; _totalLength = 0; _valueCount = 0; } void updateMinMax(T value) { if (!_hasMinimum) { _hasMinimum = _hasMaximum = true; _minimum = _maximum = value; } else if (compare(value, _minimum)) { _minimum = value; } else if (compare(_maximum, value)) { _maximum = value; } } // sum is not merged here as we need to check overflow void merge(const InternalStatisticsImpl& other) { _hasNull = _hasNull || other._hasNull; _valueCount += other._valueCount; if (other._hasMinimum) { if (!_hasMinimum) { _hasMinimum = _hasMaximum = true; _minimum = other._minimum; _maximum = other._maximum; } else { // all template types should support operator< if (compare(_maximum, other._maximum)) { _maximum = other._maximum; } if (compare(other._minimum, _minimum)) { _minimum = other._minimum; } } } _hasTotalLength = _hasTotalLength && other._hasTotalLength; _totalLength += other._totalLength; } }; typedef InternalStatisticsImpl InternalCharStatistics; typedef InternalStatisticsImpl InternalBooleanStatistics; typedef InternalStatisticsImpl InternalIntegerStatistics; typedef InternalStatisticsImpl InternalDateStatistics; typedef InternalStatisticsImpl InternalDoubleStatistics; typedef InternalStatisticsImpl InternalDecimalStatistics; typedef InternalStatisticsImpl InternalStringStatistics; typedef InternalStatisticsImpl InternalCollectionStatistics; /** * Mutable column statistics for use by the writer. */ class MutableColumnStatistics { public: virtual ~MutableColumnStatistics(); virtual void increase(uint64_t count) = 0; virtual void setNumberOfValues(uint64_t value) = 0; virtual void setHasNull(bool hasNull) = 0; virtual void merge(const MutableColumnStatistics& other) = 0; virtual void reset() = 0; virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0; }; /** * ColumnStatistics Implementation */ class ColumnStatisticsImpl : public ColumnStatistics, public MutableColumnStatistics { private: InternalCharStatistics _stats; public: ColumnStatisticsImpl() { reset(); } ColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~ColumnStatisticsImpl() override; uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } void merge(const MutableColumnStatistics& other) override { _stats.merge(dynamic_cast(other)._stats); } void reset() override { _stats.reset(); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); } std::string toString() const override { std::ostringstream buffer; buffer << "Column has " << getNumberOfValues() << " values" << " and has null value: " << (hasNull() ? "yes" : "no") << std::endl; return buffer.str(); } }; class BinaryColumnStatisticsImpl : public BinaryColumnStatistics, public MutableColumnStatistics { private: InternalCharStatistics _stats; public: BinaryColumnStatisticsImpl() { reset(); } BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~BinaryColumnStatisticsImpl() override; uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } bool hasTotalLength() const override { return _stats.hasTotalLength(); } uint64_t getTotalLength() const override { if (hasTotalLength()) { return _stats.getTotalLength(); } else { throw ParseError("Total length is not defined."); } } void setTotalLength(uint64_t length) { _stats.setHasTotalLength(true); _stats.setTotalLength(length); } void update(size_t length) { _stats.setTotalLength(_stats.getTotalLength() + length); } void merge(const MutableColumnStatistics& other) override { const BinaryColumnStatisticsImpl& binStats = dynamic_cast(other); _stats.merge(binStats._stats); } void reset() override { _stats.reset(); setTotalLength(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::BinaryStatistics* binStats = pbStats.mutable_binary_statistics(); binStats->set_sum(static_cast(_stats.getTotalLength())); } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Binary" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasTotalLength()) { buffer << "Total length: " << getTotalLength() << std::endl; } else { buffer << "Total length: not defined" << std::endl; } return buffer.str(); } }; class BooleanColumnStatisticsImpl : public BooleanColumnStatistics, public MutableColumnStatistics { private: InternalBooleanStatistics _stats; bool _hasCount; uint64_t _trueCount; public: BooleanColumnStatisticsImpl() { reset(); } BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~BooleanColumnStatisticsImpl() override; bool hasCount() const override { return _hasCount; } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); _hasCount = true; } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } uint64_t getFalseCount() const override { if (hasCount()) { return getNumberOfValues() - _trueCount; } else { throw ParseError("False count is not defined."); } } uint64_t getTrueCount() const override { if (hasCount()) { return _trueCount; } else { throw ParseError("True count is not defined."); } } void setTrueCount(uint64_t trueCount) { _hasCount = true; _trueCount = trueCount; } void update(bool value, size_t repetitions) { if (value) { _trueCount += repetitions; } } void merge(const MutableColumnStatistics& other) override { const BooleanColumnStatisticsImpl& boolStats = dynamic_cast(other); _stats.merge(boolStats._stats); _hasCount = _hasCount && boolStats._hasCount; _trueCount += boolStats._trueCount; } void reset() override { _stats.reset(); setTrueCount(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::BucketStatistics* bucketStats = pbStats.mutable_bucket_statistics(); if (_hasCount) { bucketStats->add_count(_trueCount); } else { bucketStats->clear_count(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Boolean" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasCount()) { buffer << "(true: " << getTrueCount() << "; false: " << getFalseCount() << ")" << std::endl; } else { buffer << "(true: not defined; false: not defined)" << std::endl; buffer << "True and false counts are not defined" << std::endl; } return buffer.str(); } }; class DateColumnStatisticsImpl : public DateColumnStatistics, public MutableColumnStatistics { private: InternalDateStatistics _stats; public: DateColumnStatisticsImpl() { reset(); } DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~DateColumnStatisticsImpl() override; bool hasMinimum() const override { return _stats.hasMinimum(); } bool hasMaximum() const override { return _stats.hasMaximum(); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } int32_t getMinimum() const override { if (hasMinimum()) { return _stats.getMinimum(); } else { throw ParseError("Minimum is not defined."); } } int32_t getMaximum() const override { if (hasMaximum()) { return _stats.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(int32_t minimum) { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximum(int32_t maximum) { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } void update(int32_t value) { _stats.updateMinMax(value); } void merge(const MutableColumnStatistics& other) override { const DateColumnStatisticsImpl& dateStats = dynamic_cast(other); _stats.merge(dateStats._stats); } void reset() override { _stats.reset(); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::DateStatistics* dateStatistics = pbStats.mutable_date_statistics(); if (_stats.hasMinimum()) { dateStatistics->set_maximum(_stats.getMaximum()); dateStatistics->set_minimum(_stats.getMinimum()); } else { dateStatistics->clear_minimum(); dateStatistics->clear_maximum(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Date" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; } else { buffer << "Minimum: not defined" << std::endl; } if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; } else { buffer << "Maximum: not defined" << std::endl; } return buffer.str(); } }; class DecimalColumnStatisticsImpl : public DecimalColumnStatistics, public MutableColumnStatistics { private: InternalDecimalStatistics _stats; public: DecimalColumnStatisticsImpl() { reset(); } DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~DecimalColumnStatisticsImpl() override; bool hasMinimum() const override { return _stats.hasMinimum(); } bool hasMaximum() const override { return _stats.hasMaximum(); } bool hasSum() const override { return _stats.hasSum(); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } Decimal getMinimum() const override { if (hasMinimum()) { return _stats.getMinimum(); } else { throw ParseError("Minimum is not defined."); } } Decimal getMaximum() const override { if (hasMaximum()) { return _stats.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(Decimal minimum) { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximum(Decimal maximum) { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } Decimal getSum() const override { if (hasSum()) { return _stats.getSum(); } else { throw ParseError("Sum is not defined."); } } void setSum(Decimal sum) { _stats.setHasSum(true); _stats.setSum(sum); } void update(const Decimal& value) { _stats.updateMinMax(value); if (_stats.hasSum()) { updateSum(value); } } void merge(const MutableColumnStatistics& other) override { const DecimalColumnStatisticsImpl& decStats = dynamic_cast(other); _stats.merge(decStats._stats); _stats.setHasSum(_stats.hasSum() && decStats.hasSum()); if (_stats.hasSum()) { updateSum(decStats.getSum()); } } void reset() override { _stats.reset(); setSum(Decimal()); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::DecimalStatistics* decStats = pbStats.mutable_decimal_statistics(); if (_stats.hasMinimum()) { decStats->set_minimum(TString(_stats.getMinimum().toString(true))); decStats->set_maximum(TString(_stats.getMaximum().toString(true))); } else { decStats->clear_minimum(); decStats->clear_maximum(); } if (_stats.hasSum()) { decStats->set_sum(TString(_stats.getSum().toString(true))); } else { decStats->clear_sum(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Decimal" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimum()) { buffer << "Minimum: " << getMinimum().toString() << std::endl; } else { buffer << "Minimum: not defined" << std::endl; } if (hasMaximum()) { buffer << "Maximum: " << getMaximum().toString() << std::endl; } else { buffer << "Maximum: not defined" << std::endl; } if (hasSum()) { buffer << "Sum: " << getSum().toString() << std::endl; } else { buffer << "Sum: not defined" << std::endl; } return buffer.str(); } private: void updateSum(Decimal value) { if (_stats.hasSum()) { bool overflow = false; Decimal sum = _stats.getSum(); if (sum.scale > value.scale) { value.value = scaleUpInt128ByPowerOfTen(value.value, sum.scale - value.scale, overflow); } else if (sum.scale < value.scale) { sum.value = scaleUpInt128ByPowerOfTen(sum.value, value.scale - sum.scale, overflow); sum.scale = value.scale; } if (!overflow) { bool wasPositive = sum.value >= 0; sum.value += value.value; if ((value.value >= 0) == wasPositive) { _stats.setHasSum((sum.value >= 0) == wasPositive); } } else { _stats.setHasSum(false); } if (_stats.hasSum()) { _stats.setSum(sum); } } } }; class DoubleColumnStatisticsImpl : public DoubleColumnStatistics, public MutableColumnStatistics { private: InternalDoubleStatistics _stats; public: DoubleColumnStatisticsImpl() { reset(); } DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~DoubleColumnStatisticsImpl() override; bool hasMinimum() const override { return _stats.hasMinimum(); } bool hasMaximum() const override { return _stats.hasMaximum(); } bool hasSum() const override { return _stats.hasSum(); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } double getMinimum() const override { if (hasMinimum()) { return _stats.getMinimum(); } else { throw ParseError("Minimum is not defined."); } } double getMaximum() const override { if (hasMaximum()) { return _stats.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(double minimum) { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximum(double maximum) { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } double getSum() const override { if (hasSum()) { return _stats.getSum(); } else { throw ParseError("Sum is not defined."); } } void setSum(double sum) { _stats.setHasSum(true); _stats.setSum(sum); } void update(double value) { _stats.updateMinMax(value); _stats.setSum(_stats.getSum() + value); } void merge(const MutableColumnStatistics& other) override { const DoubleColumnStatisticsImpl& doubleStats = dynamic_cast(other); _stats.merge(doubleStats._stats); _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); if (_stats.hasSum()) { _stats.setSum(_stats.getSum() + doubleStats.getSum()); } } void reset() override { _stats.reset(); setSum(0.0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::DoubleStatistics* doubleStats = pbStats.mutable_double_statistics(); if (_stats.hasMinimum()) { doubleStats->set_minimum(_stats.getMinimum()); doubleStats->set_maximum(_stats.getMaximum()); } else { doubleStats->clear_minimum(); doubleStats->clear_maximum(); } if (_stats.hasSum()) { doubleStats->set_sum(_stats.getSum()); } else { doubleStats->clear_sum(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Double" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; } else { buffer << "Minimum: not defined" << std::endl; } if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; } else { buffer << "Maximum: not defined" << std::endl; } if (hasSum()) { buffer << "Sum: " << getSum() << std::endl; } else { buffer << "Sum: not defined" << std::endl; } return buffer.str(); } }; class IntegerColumnStatisticsImpl : public IntegerColumnStatistics, public MutableColumnStatistics { private: InternalIntegerStatistics _stats; public: IntegerColumnStatisticsImpl() { reset(); } IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~IntegerColumnStatisticsImpl() override; bool hasMinimum() const override { return _stats.hasMinimum(); } bool hasMaximum() const override { return _stats.hasMaximum(); } bool hasSum() const override { return _stats.hasSum(); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } int64_t getMinimum() const override { if (hasMinimum()) { return _stats.getMinimum(); } else { throw ParseError("Minimum is not defined."); } } int64_t getMaximum() const override { if (hasMaximum()) { return _stats.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(int64_t minimum) { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximum(int64_t maximum) { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } int64_t getSum() const override { if (hasSum()) { return _stats.getSum(); } else { throw ParseError("Sum is not defined."); } } void setSum(int64_t sum) { _stats.setHasSum(true); _stats.setSum(sum); } void update(int64_t value, int repetitions) { _stats.updateMinMax(value); if (_stats.hasSum()) { if (repetitions > 1) { _stats.setHasSum(multiplyExact(value, repetitions, &value)); } if (_stats.hasSum()) { _stats.setHasSum(addExact(_stats.getSum(), value, &value)); if (_stats.hasSum()) { _stats.setSum(value); } } } } void merge(const MutableColumnStatistics& other) override { const IntegerColumnStatisticsImpl& intStats = dynamic_cast(other); _stats.merge(intStats._stats); // update sum and check overflow _stats.setHasSum(_stats.hasSum() && intStats.hasSum()); if (_stats.hasSum()) { int64_t value; _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value)); if (_stats.hasSum()) { _stats.setSum(value); } } } void reset() override { _stats.reset(); setSum(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::IntegerStatistics* intStats = pbStats.mutable_int_statistics(); if (_stats.hasMinimum()) { intStats->set_minimum(_stats.getMinimum()); intStats->set_maximum(_stats.getMaximum()); } else { intStats->clear_minimum(); intStats->clear_maximum(); } if (_stats.hasSum()) { intStats->set_sum(_stats.getSum()); } else { intStats->clear_sum(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Integer" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; } else { buffer << "Minimum: not defined" << std::endl; } if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; } else { buffer << "Maximum: not defined" << std::endl; } if (hasSum()) { buffer << "Sum: " << getSum() << std::endl; } else { buffer << "Sum: not defined" << std::endl; } return buffer.str(); } }; class StringColumnStatisticsImpl : public StringColumnStatistics, public MutableColumnStatistics { private: InternalStringStatistics _stats; public: StringColumnStatisticsImpl() { reset(); } StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~StringColumnStatisticsImpl() override; bool hasMinimum() const override { return _stats.hasMinimum(); } bool hasMaximum() const override { return _stats.hasMaximum(); } bool hasTotalLength() const override { return _stats.hasTotalLength(); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } const std::string& getMinimum() const override { if (hasMinimum()) { return _stats.getMinimum(); } else { throw ParseError("Minimum is not defined."); } } const std::string& getMaximum() const override { if (hasMaximum()) { return _stats.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(std::string minimum) { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximum(std::string maximum) { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } uint64_t getTotalLength() const override { if (hasTotalLength()) { return _stats.getTotalLength(); } else { throw ParseError("Total length is not defined."); } } void setTotalLength(uint64_t length) { _stats.setHasTotalLength(true); _stats.setTotalLength(length); } void update(const char* value, size_t length) { if (value != nullptr) { if (!_stats.hasMinimum()) { std::string tempStr(value, value + length); setMinimum(tempStr); setMaximum(tempStr); } else { // update min int minCmp = strncmp(_stats.getMinimum().c_str(), value, std::min(_stats.getMinimum().length(), length)); if (minCmp > 0 || (minCmp == 0 && length < _stats.getMinimum().length())) { setMinimum(std::string(value, value + length)); } // update max int maxCmp = strncmp(_stats.getMaximum().c_str(), value, std::min(_stats.getMaximum().length(), length)); if (maxCmp < 0 || (maxCmp == 0 && length > _stats.getMaximum().length())) { setMaximum(std::string(value, value + length)); } } } _stats.setTotalLength(_stats.getTotalLength() + length); } void update(std::string value) { update(value.c_str(), value.length()); } void merge(const MutableColumnStatistics& other) override { const StringColumnStatisticsImpl& strStats = dynamic_cast(other); _stats.merge(strStats._stats); } void reset() override { _stats.reset(); setTotalLength(0); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::StringStatistics* strStats = pbStats.mutable_string_statistics(); if (_stats.hasMinimum()) { strStats->set_minimum(TString(_stats.getMinimum())); strStats->set_maximum(TString(_stats.getMaximum())); } else { strStats->clear_minimum(); strStats->clear_maximum(); } if (_stats.hasTotalLength()) { strStats->set_sum(static_cast(_stats.getTotalLength())); } else { strStats->clear_sum(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: String" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimum()) { buffer << "Minimum: " << getMinimum() << std::endl; } else { buffer << "Minimum is not defined" << std::endl; } if (hasMaximum()) { buffer << "Maximum: " << getMaximum() << std::endl; } else { buffer << "Maximum is not defined" << std::endl; } if (hasTotalLength()) { buffer << "Total length: " << getTotalLength() << std::endl; } else { buffer << "Total length is not defined" << std::endl; } return buffer.str(); } }; class TimestampColumnStatisticsImpl : public TimestampColumnStatistics, public MutableColumnStatistics { private: InternalIntegerStatistics _stats; bool _hasLowerBound; bool _hasUpperBound; int64_t _lowerBound; int64_t _upperBound; int32_t _minimumNanos; // last 6 digits of nanosecond of minimum timestamp int32_t _maximumNanos; // last 6 digits of nanosecond of maximum timestamp static constexpr int32_t DEFAULT_MIN_NANOS = 0; static constexpr int32_t DEFAULT_MAX_NANOS = 999999; public: TimestampColumnStatisticsImpl() { reset(); } TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, const StatContext& statContext); virtual ~TimestampColumnStatisticsImpl() override; bool hasMinimum() const override { return _stats.hasMinimum(); } bool hasMaximum() const override { return _stats.hasMaximum(); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } int64_t getMinimum() const override { if (hasMinimum()) { return _stats.getMinimum(); } else { throw ParseError("Minimum is not defined."); } } int64_t getMaximum() const override { if (hasMaximum()) { return _stats.getMaximum(); } else { throw ParseError("Maximum is not defined."); } } void setMinimum(int64_t minimum) { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximum(int64_t maximum) { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } void update(int64_t value) { _stats.updateMinMax(value); } void update(int64_t milli, int32_t nano) { if (!_stats.hasMinimum()) { _stats.setHasMinimum(true); _stats.setHasMaximum(true); _stats.setMinimum(milli); _stats.setMaximum(milli); _maximumNanos = _minimumNanos = nano; } else { if (milli <= _stats.getMinimum()) { if (milli < _stats.getMinimum() || nano < _minimumNanos) { _minimumNanos = nano; } _stats.setMinimum(milli); } if (milli >= _stats.getMaximum()) { if (milli > _stats.getMaximum() || nano > _maximumNanos) { _maximumNanos = nano; } _stats.setMaximum(milli); } } } void merge(const MutableColumnStatistics& other) override { const TimestampColumnStatisticsImpl& tsStats = dynamic_cast(other); _stats.setHasNull(_stats.hasNull() || tsStats.hasNull()); _stats.setNumberOfValues(_stats.getNumberOfValues() + tsStats.getNumberOfValues()); if (tsStats.hasMinimum()) { if (!_stats.hasMinimum()) { _stats.setHasMinimum(true); _stats.setHasMaximum(true); _stats.setMinimum(tsStats.getMinimum()); _stats.setMaximum(tsStats.getMaximum()); _minimumNanos = tsStats.getMinimumNanos(); _maximumNanos = tsStats.getMaximumNanos(); } else { if (tsStats.getMaximum() >= _stats.getMaximum()) { if (tsStats.getMaximum() > _stats.getMaximum() || tsStats.getMaximumNanos() > _maximumNanos) { _maximumNanos = tsStats.getMaximumNanos(); } _stats.setMaximum(tsStats.getMaximum()); } if (tsStats.getMinimum() <= _stats.getMinimum()) { if (tsStats.getMinimum() < _stats.getMinimum() || tsStats.getMinimumNanos() < _minimumNanos) { _minimumNanos = tsStats.getMinimumNanos(); } _stats.setMinimum(tsStats.getMinimum()); } } } } void reset() override { _stats.reset(); _minimumNanos = DEFAULT_MIN_NANOS; _maximumNanos = DEFAULT_MAX_NANOS; } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::TimestampStatistics* tsStats = pbStats.mutable_timestamp_statistics(); if (_stats.hasMinimum()) { tsStats->set_minimum_utc(_stats.getMinimum()); tsStats->set_maximum_utc(_stats.getMaximum()); if (_minimumNanos != DEFAULT_MIN_NANOS) { tsStats->set_minimum_nanos(_minimumNanos + 1); } if (_maximumNanos != DEFAULT_MAX_NANOS) { tsStats->set_maximum_nanos(_maximumNanos + 1); } } else { tsStats->clear_minimum_utc(); tsStats->clear_maximum_utc(); tsStats->clear_minimum_nanos(); tsStats->clear_maximum_nanos(); } } std::string toString() const override { std::ostringstream buffer; struct tm tmValue; char timeBuffer[20]; time_t secs = 0; buffer << "Data type: Timestamp" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimum()) { secs = static_cast(getMinimum() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); buffer << "Minimum: " << timeBuffer << "." << (getMinimum() % 1000) << std::endl; } else { buffer << "Minimum is not defined" << std::endl; } if (hasLowerBound()) { secs = static_cast(getLowerBound() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); buffer << "LowerBound: " << timeBuffer << "." << (getLowerBound() % 1000) << std::endl; } else { buffer << "LowerBound is not defined" << std::endl; } if (hasMaximum()) { secs = static_cast(getMaximum() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); buffer << "Maximum: " << timeBuffer << "." << (getMaximum() % 1000) << std::endl; } else { buffer << "Maximum is not defined" << std::endl; } if (hasUpperBound()) { secs = static_cast(getUpperBound() / 1000); gmtime_r(&secs, &tmValue); strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); buffer << "UpperBound: " << timeBuffer << "." << (getUpperBound() % 1000) << std::endl; } else { buffer << "UpperBound is not defined" << std::endl; } return buffer.str(); } bool hasLowerBound() const override { return _hasLowerBound; } bool hasUpperBound() const override { return _hasUpperBound; } int64_t getLowerBound() const override { if (hasLowerBound()) { return _lowerBound; } else { throw ParseError("LowerBound is not defined."); } } int64_t getUpperBound() const override { if (hasUpperBound()) { return _upperBound; } else { throw ParseError("UpperBound is not defined."); } } int32_t getMinimumNanos() const override { if (hasMinimum()) { return _minimumNanos; } else { throw ParseError("Minimum is not defined."); } } int32_t getMaximumNanos() const override { if (hasMaximum()) { return _maximumNanos; } else { throw ParseError("Maximum is not defined."); } } }; class CollectionColumnStatisticsImpl : public CollectionColumnStatistics, public MutableColumnStatistics { private: InternalCollectionStatistics _stats; public: CollectionColumnStatisticsImpl() { reset(); } CollectionColumnStatisticsImpl(const proto::ColumnStatistics& stats); virtual ~CollectionColumnStatisticsImpl() override; bool hasMinimumChildren() const override { return _stats.hasMinimum(); } bool hasMaximumChildren() const override { return _stats.hasMaximum(); } bool hasTotalChildren() const override { return _stats.hasSum(); } void increase(uint64_t count) override { _stats.setNumberOfValues(_stats.getNumberOfValues() + count); } uint64_t getNumberOfValues() const override { return _stats.getNumberOfValues(); } void setNumberOfValues(uint64_t value) override { _stats.setNumberOfValues(value); } bool hasNull() const override { return _stats.hasNull(); } void setHasNull(bool hasNull) override { _stats.setHasNull(hasNull); } uint64_t getMinimumChildren() const override { if (hasMinimumChildren()) { return _stats.getMinimum(); } else { throw ParseError("MinimumChildren is not defined."); } } uint64_t getMaximumChildren() const override { if (hasMaximumChildren()) { return _stats.getMaximum(); } else { throw ParseError("MaximumChildren is not defined."); } } uint64_t getTotalChildren() const override { if (hasTotalChildren()) { return _stats.getSum(); } else { throw ParseError("TotalChildren is not defined."); } } void setMinimumChildren(uint64_t minimum) override { _stats.setHasMinimum(true); _stats.setMinimum(minimum); } void setMaximumChildren(uint64_t maximum) override { _stats.setHasMaximum(true); _stats.setMaximum(maximum); } void setTotalChildren(uint64_t sum) override { _stats.setHasSum(true); _stats.setSum(sum); } void setHasTotalChildren(bool hasSum) override { _stats.setHasSum(hasSum); } void merge(const MutableColumnStatistics& other) override { const CollectionColumnStatisticsImpl& collectionStats = dynamic_cast(other); _stats.merge(collectionStats._stats); // hasSumValue here means no overflow _stats.setHasSum(_stats.hasSum() && collectionStats.hasTotalChildren()); if (_stats.hasSum()) { uint64_t oldSum = _stats.getSum(); _stats.setSum(_stats.getSum() + collectionStats.getTotalChildren()); if (oldSum > _stats.getSum()) { _stats.setHasSum(false); } } } void reset() override { _stats.reset(); setTotalChildren(0); } void update(uint64_t value) { _stats.updateMinMax(value); if (_stats.hasSum()) { uint64_t oldSum = _stats.getSum(); _stats.setSum(_stats.getSum() + value); if (oldSum > _stats.getSum()) { _stats.setHasSum(false); } } } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { pbStats.set_has_null(_stats.hasNull()); pbStats.set_number_of_values(_stats.getNumberOfValues()); proto::CollectionStatistics* collectionStats = pbStats.mutable_collection_statistics(); if (_stats.hasMinimum()) { collectionStats->set_min_children(_stats.getMinimum()); collectionStats->set_max_children(_stats.getMaximum()); } else { collectionStats->clear_min_children(); collectionStats->clear_max_children(); } if (_stats.hasSum()) { collectionStats->set_total_children(_stats.getSum()); } else { collectionStats->clear_total_children(); } } std::string toString() const override { std::ostringstream buffer; buffer << "Data type: Collection(LIST|MAP)" << std::endl << "Values: " << getNumberOfValues() << std::endl << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; if (hasMinimumChildren()) { buffer << "MinChildren: " << getMinimumChildren() << std::endl; } else { buffer << "MinChildren is not defined" << std::endl; } if (hasMaximumChildren()) { buffer << "MaxChildren: " << getMaximumChildren() << std::endl; } else { buffer << "MaxChildren is not defined" << std::endl; } if (hasTotalChildren()) { buffer << "TotalChildren: " << getTotalChildren() << std::endl; } else { buffer << "TotalChildren is not defined" << std::endl; } return buffer.str(); } }; ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, const StatContext& statContext); class StatisticsImpl : public Statistics { private: std::vector colStats; // DELIBERATELY NOT IMPLEMENTED StatisticsImpl(const StatisticsImpl&); StatisticsImpl& operator=(const StatisticsImpl&); public: StatisticsImpl(const proto::StripeStatistics& stripeStats, const StatContext& statContext); StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { return colStats[columnId]; } virtual ~StatisticsImpl() override; uint32_t getNumberOfColumns() const override { return static_cast(colStats.size()); } }; class StripeStatisticsImpl : public StripeStatistics { private: std::unique_ptr columnStats; std::vector > > rowIndexStats; // DELIBERATELY NOT IMPLEMENTED StripeStatisticsImpl(const StripeStatisticsImpl&); StripeStatisticsImpl& operator=(const StripeStatisticsImpl&); public: StripeStatisticsImpl(const proto::StripeStatistics& stripeStats, std::vector >& indexStats, const StatContext& statContext); virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override { return columnStats->getColumnStatistics(columnId); } uint32_t getNumberOfColumns() const override { return columnStats->getNumberOfColumns(); } virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, uint32_t rowIndex) const override { // check id indices are valid return rowIndexStats[columnId][rowIndex].get(); } virtual ~StripeStatisticsImpl() override; uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { return static_cast(rowIndexStats[columnId].size()); } }; /** * Create ColumnStatistics for writers * @param type of column * @return MutableColumnStatistics instances */ std::unique_ptr createColumnStatistics(const Type& type); } // namespace orc #endif