Просмотр исходного кода

Update contrib/libs/apache/orc to 1.8.0

primorial 2 года назад
Родитель
Коммит
b327caf7cf

+ 6 - 0
contrib/libs/apache/orc/CMakeLists.txt

@@ -61,6 +61,12 @@ target_sources(libs-apache-orc PRIVATE
   ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/Writer.cc
   ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/io/InputStream.cc
   ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/io/OutputStream.cc
+  ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
+  ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/sargs/Literal.cc
+  ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/sargs/PredicateLeaf.cc
+  ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/sargs/SargsApplier.cc
+  ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
+  ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/src/sargs/TruthValue.cc
 )
 target_proto_addincls(libs-apache-orc
   ./

+ 6 - 7
contrib/libs/apache/orc/README.md

@@ -15,18 +15,18 @@ lists, maps, and unions.
 
 ## ORC File Library
 
-This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files. But the C++ library only writes the original (Hive 0.11) version of ORC files, and will be extended in the future.
+This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files.
 
 Releases:
 * Latest: <a href="http://orc.apache.org/releases">Apache ORC releases</a>
 * Maven Central: <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a>
 * Downloads: <a href="http://orc.apache.org/downloads">Apache ORC downloads</a>
+* Release tags: <a href="https://github.com/apache/orc/releases">Apache ORC release tags</a>
+* Plan: <a href="https://github.com/apache/orc/milestones">Apache ORC future release plan</a>
 
 The current build status:
-* Master branch <a href="https://travis-ci.org/apache/orc/branches">
-![master build status](https://travis-ci.org/apache/orc.svg?branch=master)</a>
-* <a href="https://travis-ci.org/apache/orc/pull_requests">Pull Requests</a>
-
+* Main branch <a href="https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain">
+![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main)</a>
 
 Bug tracking: <a href="http://orc.apache.org/bugs">Apache Jira</a>
 
@@ -39,13 +39,12 @@ The subdirectories are:
 * java - the java reader and writer
 * proto - the protocol buffer definition for the ORC metadata
 * site - the website and documentation
-* snap - the script to build [snaps](https://snapcraft.io/) of the ORC tools
 * tools - the c++ tools for reading and inspecting ORC files
 
 ### Building
 
 * Install java 1.8 or higher
-* Install maven 3 or higher
+* Install maven 3.8.6 or higher
 * Install cmake
 
 To build a release version with debug information:

+ 1 - 1
contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh

@@ -40,6 +40,6 @@ namespace orc {
     std::vector<std::shared_ptr<BloomFilter>> entries;
   };
 
-};
+}
 
 #endif //ORC_BLOOMFILTER_HH

+ 36 - 0
contrib/libs/apache/orc/c++/include/orc/Common.hh

@@ -34,6 +34,7 @@ namespace orc {
   public:
     static const FileVersion& v_0_11();
     static const FileVersion& v_0_12();
+    static const FileVersion& UNSTABLE_PRE_2_0();
 
     FileVersion(uint32_t major, uint32_t minor) :
                 majorVersion(major), minorVersion(minor) {
@@ -122,6 +123,17 @@ namespace orc {
     StreamKind_BLOOM_FILTER_UTF8 = 8
   };
 
+  /**
+   * Specific read intention when selecting a certain TypeId.
+   * This enum currently only being utilized by LIST, MAP, and UNION type selection.
+   */
+  enum ReadIntent {
+    ReadIntent_ALL = 0,
+
+    // Only read the offsets of selected type. Do not read the children types.
+    ReadIntent_OFFSETS = 1
+  };
+
   /**
    * Get the string representation of the StreamKind.
    */
@@ -281,6 +293,30 @@ namespace orc {
     FUTURE = INT32_MAX
   };
 
+  inline bool operator<(const Decimal& lhs, const Decimal& rhs) {
+    return compare(lhs, rhs);
+  }
+
+  inline bool operator>(const Decimal& lhs, const Decimal& rhs) {
+    return rhs < lhs;
+  }
+
+  inline bool operator<=(const Decimal& lhs, const Decimal& rhs) {
+    return !(lhs > rhs);
+  }
+
+  inline bool operator>=(const Decimal& lhs, const Decimal& rhs) {
+    return !(lhs < rhs);
+  }
+
+  inline bool operator!=(const Decimal& lhs, const Decimal& rhs) {
+    return lhs < rhs || rhs < lhs;
+  }
+
+  inline bool operator==(const Decimal& lhs, const Decimal& rhs) {
+    return !(lhs != rhs);
+  }
+
 }
 
 #endif

+ 6 - 1
contrib/libs/apache/orc/c++/include/orc/Int128.hh

@@ -311,8 +311,13 @@ namespace orc {
     /**
      * Return the base 10 string representation with a decimal point,
      * the given number of places after the decimal.
+     *
+     * @param scale scale of the Int128 to be interpreted as a decimal value
+     * @param trimTrailingZeros whether or not to trim trailing zeros
+     * @return converted string representation
      */
-    std::string toDecimalString(int32_t scale=0) const;
+    std::string toDecimalString(int32_t scale = 0,
+                                bool trimTrailingZeros = false) const;
 
     /**
      * Return the base 16 string representation of the two's complement with

+ 44 - 0
contrib/libs/apache/orc/c++/include/orc/Reader.hh

@@ -23,6 +23,7 @@
 #include "orc/Common.hh"
 #include "orc/orc-config.hh"
 #include "orc/Statistics.hh"
+#include "orc/sargs/SearchArgument.hh"
 #include "orc/Type.hh"
 #include "orc/Vector.hh"
 
@@ -148,6 +149,24 @@ namespace orc {
      */
     RowReaderOptions& includeTypes(const std::list<uint64_t>& types);
 
+    /**
+     * A map type of <typeId, ReadIntent>.
+     */
+    typedef std::map<uint64_t, ReadIntent> IdReadIntentMap;
+
+    /**
+     * Selects which type ids to read and specific ReadIntents for each
+     * type id. The ancestor types are automatically selected, but the children
+     * are not.
+     *
+     * This option clears any previous setting of the selected columns or
+     * types.
+     * @param idReadIntentMap a map of IdReadIntentMap.
+     * @return this
+     */
+    RowReaderOptions&
+    includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap);
+
     /**
      * Set the section of the file to process.
      * @param offset the starting byte offset
@@ -191,6 +210,11 @@ namespace orc {
      */
     RowReaderOptions& setEnableLazyDecoding(bool enable);
 
+    /**
+     * Set search argument for predicate push down
+     */
+    RowReaderOptions& searchArgument(std::unique_ptr<SearchArgument> sargs);
+
     /**
      * Should enable encoding block mode
      */
@@ -245,6 +269,26 @@ namespace orc {
      * What scale should all Hive 0.11 decimals be normalized to?
      */
     int32_t getForcedScaleOnHive11Decimal() const;
+
+    /**
+     * Get search argument for predicate push down
+     */
+    std::shared_ptr<SearchArgument> getSearchArgument() const;
+
+    /**
+     * Set desired timezone to return data of timestamp type
+     */
+    RowReaderOptions& setTimezoneName(const std::string& zoneName);
+
+    /**
+     * Get desired timezone to return data of timestamp type
+     */
+    const std::string& getTimezoneName() const;
+
+    /**
+     * Get the IdReadIntentMap map that was supplied by client.
+     */
+    const IdReadIntentMap getIdReadIntentMap() const;
   };
 
 

+ 84 - 6
contrib/libs/apache/orc/c++/include/orc/Statistics.hh

@@ -305,26 +305,26 @@ namespace orc {
     virtual ~TimestampColumnStatistics();
 
     /**
-     * Check whether column minimum.
+     * Check whether minimum timestamp exists.
      * @return true if has minimum
      */
     virtual bool hasMinimum() const = 0;
 
     /**
-     * Check whether column maximum.
+     * Check whether maximum timestamp exists.
      * @return true if has maximum
      */
     virtual bool hasMaximum() const = 0;
 
     /**
-     * Get the minimum value for the column.
-     * @return minimum value
+     * Get the millisecond of minimum timestamp in UTC.
+     * @return minimum value in millisecond
      */
     virtual int64_t getMinimum() const = 0;
 
     /**
-     * Get the maximum value for the column.
-     * @return maximum value
+     * Get the millisecond of maximum timestamp in UTC.
+     * @return maximum value in millisecond
      */
     virtual int64_t getMaximum() const = 0;
 
@@ -352,7 +352,17 @@ namespace orc {
      */
     virtual int64_t getUpperBound() const = 0;
 
+    /**
+     * Get the last 6 digits of nanosecond of minimum timestamp.
+     * @return last 6 digits of nanosecond of minimum timestamp.
+     */
+    virtual int32_t getMinimumNanos() const = 0;
 
+    /**
+     * Get the last 6 digits of nanosecond of maximum timestamp.
+     * @return last 6 digits of nanosecond of maximum timestamp.
+     */
+    virtual int32_t getMaximumNanos() const = 0;
   };
 
   class Statistics {
@@ -374,6 +384,74 @@ namespace orc {
     virtual uint32_t getNumberOfColumns() const = 0;
   };
 
+  /**
+   * Statistics for all of collections such as Map and List.
+   */
+  class CollectionColumnStatistics : public ColumnStatistics {
+  public:
+    virtual ~CollectionColumnStatistics();
+
+    /**
+     * check whether column has minimum number of children
+     * @return true if has minimum children count
+     */
+    virtual bool hasMinimumChildren() const = 0;
+
+    /**
+     * check whether column has maximum number of children
+     * @return true if has maximum children count
+     */
+    virtual bool hasMaximumChildren() const = 0;
+
+    /**
+     * check whether column has total number of children
+     * @return true if has total children count
+     */
+    virtual bool hasTotalChildren() const = 0;
+
+    /**
+     * set hasTotalChildren value
+     * @param newHasTotalChildren hasTotalChildren value
+     */
+    virtual void setHasTotalChildren(bool newHasTotalChildren) = 0;
+
+    /**
+     * Get minimum number of children in the collection.
+     * @return the minimum children count
+     */
+    virtual uint64_t getMinimumChildren() const = 0;
+
+    /**
+     * set new minimum children count
+     * @param min new minimum children count
+     */
+    virtual void setMinimumChildren(uint64_t min) = 0;
+
+    /**
+     * Get maximum number of children in the collection.
+     * @return the maximum children count
+     */
+    virtual uint64_t getMaximumChildren() const = 0;
+
+    /**
+     * set new maximum children count
+     * @param max new maximum children count
+     */
+    virtual void setMaximumChildren(uint64_t max) = 0;
+
+    /**
+     * Get the total number of children in the collection.
+     * @return the total number of children
+     */
+    virtual uint64_t getTotalChildren() const = 0;
+
+    /**
+     * set new total children count
+     * @param newTotalChildrenCount total children count to be set
+     */
+    virtual void setTotalChildren(uint64_t newTotalChildrenCount) = 0;
+  };
+
   class StripeStatistics : public Statistics {
   public:
     virtual ~StripeStatistics();

+ 8 - 1
contrib/libs/apache/orc/c++/include/orc/Type.hh

@@ -43,7 +43,8 @@ namespace orc {
     DECIMAL = 14,
     DATE = 15,
     VARCHAR = 16,
-    CHAR = 17
+    CHAR = 17,
+    TIMESTAMP_INSTANT = 18
   };
 
   class Type {
@@ -58,6 +59,12 @@ namespace orc {
     virtual uint64_t getMaximumLength() const = 0;
     virtual uint64_t getPrecision() const = 0;
     virtual uint64_t getScale() const = 0;
+    virtual Type& setAttribute(const std::string& key,
+                               const std::string& value) = 0;
+    virtual bool hasAttributeKey(const std::string& key) const = 0;
+    virtual Type& removeAttribute(const std::string& key) = 0;
+    virtual std::vector<std::string> getAttributeKeys() const = 0;
+    virtual std::string getAttributeValue(const std::string& key) const = 0;
     virtual std::string toString() const = 0;
 
     /**

+ 3 - 2
contrib/libs/apache/orc/c++/include/orc/Vector.hh

@@ -134,7 +134,7 @@ namespace orc {
     DataBuffer<int64_t> dictionaryOffset;
 
     void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) {
-      if (index < 0 || static_cast<uint64_t>(index) >= dictionaryOffset.size()) {
+      if (index < 0 || static_cast<uint64_t>(index) + 1 >= dictionaryOffset.size()) {
         throw std::out_of_range("index out of range.");
       }
 
@@ -154,6 +154,7 @@ namespace orc {
     EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool);
     virtual ~EncodedStringVectorBatch();
     std::string toString() const;
+    void resize(uint64_t capacity);
     std::shared_ptr<StringDictionary> dictionary;
 
     // index for dictionary entry
@@ -240,7 +241,7 @@ namespace orc {
     explicit Decimal(const std::string& value);
     Decimal();
 
-    std::string toString() const;
+    std::string toString(bool trimTrailingZeros = false) const;
     Int128 value;
     int32_t scale;
   };

+ 18 - 0
contrib/libs/apache/orc/c++/include/orc/Writer.hh

@@ -217,6 +217,24 @@ namespace orc {
      * Get version of BloomFilter
      */
     BloomFilterVersion getBloomFilterVersion() const;
+
+    /**
+     * Get writer timezone
+     * @return writer timezone
+     */
+    const Timezone& getTimezone() const;
+
+    /**
+     * Get writer timezone name
+     * @return writer timezone name
+     */
+    const std::string& getTimezoneName() const;
+
+    /**
+     * Set writer timezone
+     * @param zone writer timezone name
+     */
+    WriterOptions& setTimezoneName(const std::string& zone);
   };
 
   class Writer {

Некоторые файлы не были показаны из-за большого количества измененных файлов