You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@orc.apache.org by GitBox <gi...@apache.org> on 2022/11/07 15:35:19 UTC
[GitHub] [orc] wgtmac commented on a diff in pull request #1307: ORC-1298:[C++] Support dedicated ColumnVectorBatch of numeric types

wgtmac commented on code in PR #1307:
URL: https://github.com/apache/orc/pull/1307#discussion_r1015142239


##########
c++/include/orc/Reader.hh:
##########
@@ -325,6 +325,18 @@ namespace orc {
      * Get the IdReadIntentMap map that was supplied by client.
      */
     const IdReadIntentMap getIdReadIntentMap() const;
+
+    /**
+     * Set the enableFixedWidthNumericVectorBatch.

Review Comment:
   Please refine the comment to help understanding because this is a public API.



##########
c++/include/orc/Reader.hh:
##########
@@ -325,6 +325,18 @@ namespace orc {
      * Get the IdReadIntentMap map that was supplied by client.
      */
     const IdReadIntentMap getIdReadIntentMap() const;
+
+    /**
+     * Set the enableFixedWidthNumericVectorBatch.
+     */
+    RowReaderOptions& setEnableFixedWidthNumericVectorBatch(

Review Comment:
   This name is too long. Maybe we can change it to setUseTightNumericVector?



##########
c++/include/orc/Type.hh:
##########
@@ -75,6 +75,12 @@ namespace orc {
                                                              bool encoded = false
                                                              ) const = 0;
 
+    virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size,
+                                                             MemoryPool& pool,
+                                                             bool encoded = false,
+                                                             bool enableFixedWidthNumericVectorBatch = false

Review Comment:
   Please shorten the new parameter



##########
c++/include/orc/Vector.hh:
##########
@@ -88,28 +89,88 @@ namespace orc {
     ColumnVectorBatch& operator=(const ColumnVectorBatch&);
   };
 
-  struct LongVectorBatch: public ColumnVectorBatch {
-    LongVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~LongVectorBatch();
+  template <typename ValueType>
+  struct IntegerVectorBatch: public ColumnVectorBatch {
+    IntegerVectorBatch(uint64_t cap, MemoryPool& pool)
+                       : ColumnVectorBatch(cap, pool),
+                       data(pool, cap) {
+      // PASS
+    }
 
-    DataBuffer<int64_t> data;
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+    virtual ~IntegerVectorBatch() {
+      // PASS
+    }
+
+    std::string toString() const {
+      std::ostringstream buffer;
+      buffer << "Long vector <" << numElements << " of " << capacity << ">";
+      return buffer.str();
+    }
+
+    void resize(uint64_t cap) {
+      if (capacity < cap) {
+        ColumnVectorBatch::resize(cap);
+        data.resize(cap);
+      }
+    }
+
+    void clear() {
+      numElements = 0;
+    }
+
+    uint64_t getMemoryUsage() {
+      return ColumnVectorBatch::getMemoryUsage() +
+        static_cast<uint64_t>(data.capacity() * sizeof(int64_t));
+    }
+
+    DataBuffer<ValueType> data;
   };
 
-  struct DoubleVectorBatch: public ColumnVectorBatch {
-    DoubleVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~DoubleVectorBatch();
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+  using LongVectorBatch = IntegerVectorBatch<int64_t>;
+  using IntVectorBatch = IntegerVectorBatch<int32_t>;
+  using ShortVectorBatch = IntegerVectorBatch<int16_t>;
+  using ByteVectorBatch = IntegerVectorBatch<int8_t>;
+
+  template <typename FloatType>
+  struct BaseDoubleVectorBatch: public ColumnVectorBatch {
+    BaseDoubleVectorBatch(uint64_t cap, MemoryPool& pool)
+      : ColumnVectorBatch(cap, pool),
+        data(pool, cap) {
+      // PASS
+    }
+
+    virtual ~BaseDoubleVectorBatch() {

Review Comment:
   ~BaseDoubleVectorBatch() = default;



##########
c++/include/orc/Vector.hh:
##########
@@ -88,28 +89,88 @@ namespace orc {
     ColumnVectorBatch& operator=(const ColumnVectorBatch&);
   };
 
-  struct LongVectorBatch: public ColumnVectorBatch {
-    LongVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~LongVectorBatch();
+  template <typename ValueType>
+  struct IntegerVectorBatch: public ColumnVectorBatch {
+    IntegerVectorBatch(uint64_t cap, MemoryPool& pool)
+                       : ColumnVectorBatch(cap, pool),
+                       data(pool, cap) {
+      // PASS
+    }
 
-    DataBuffer<int64_t> data;
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+    virtual ~IntegerVectorBatch() {

Review Comment:
   ~IntegerVectorBatch() = default;



##########
c++/include/orc/Vector.hh:
##########
@@ -30,6 +30,7 @@
 #include <stdexcept>
 #include <cstdlib>
 #include <iostream>

Review Comment:
   Can you also remove iostream ?



##########
c++/src/ColumnWriter.cc:
##########
@@ -456,126 +456,107 @@ namespace orc {
     }
   }
 
+  template <typename BatchType, typename IntType>

Review Comment:
   IntType can be deduced from BatchType, isn't it?



##########
c++/src/ColumnWriter.cc:
##########
@@ -801,6 +782,7 @@ namespace orc {
     rleEncoder->recordPosition(rowIndexPosition.get());
   }
 
+  template<typename ValueType, typename BatchType>
   class DoubleColumnWriter : public ColumnWriter {

Review Comment:
   DoubleColumnWriter -> FloatingColumnWriter



##########
c++/src/ColumnWriter.cc:
##########
@@ -828,16 +810,17 @@ namespace orc {
     DataBuffer<char> buffer;
   };
 
-  DoubleColumnWriter::DoubleColumnWriter(
-                          const Type& type,
-                          const StreamsFactory& factory,
-                          const WriterOptions& options,
-                          bool isFloatType) :
-                              ColumnWriter(type, factory, options),
-                              isFloat(isFloatType),
-                              buffer(*options.getMemoryPool()) {
+  template<typename ValueType, typename BatchType>
+  DoubleColumnWriter<ValueType, BatchType>::DoubleColumnWriter(
+    const Type& type,
+    const StreamsFactory& factory,
+    const WriterOptions& options,
+    bool isFloatType) :

Review Comment:
   We should throw if isFloatType == false and ValueType is float



##########
c++/src/ColumnWriter.cc:
##########
@@ -3083,18 +3093,26 @@ namespace orc {
                                   options));
       case DOUBLE:
         return std::unique_ptr<ColumnWriter>(
-          new DoubleColumnWriter(
-                                 type,
-                                 factory,
-                                 options,
-                                 false));
+          new DoubleColumnWriter<double, DoubleVectorBatch>(

Review Comment:
   ditto



##########
c++/src/RLE.hh:
##########
@@ -59,8 +59,16 @@ namespace orc {
      *    pointer is not null, positions that are false are skipped.
      */
     virtual void add(const int64_t* data, uint64_t numValues,
-                      const char* notNull);
+                     const char* notNull);
 
+    virtual void add(const int32_t* data, uint64_t numValues,
+                     const char* notNull);
+
+    virtual void add(const int16_t* data, uint64_t numValues,
+                     const char* notNull);
+
+    virtual void add(const int8_t* data, uint64_t numValues,

Review Comment:
   RLE only supports int64_t, int32_t, and int16_t. Please remove the int8_t overload.



##########
c++/src/RLE.cc:
##########
@@ -71,14 +71,41 @@ namespace orc {
   }
 
   void RleEncoder::add(const int64_t* data, uint64_t numValues,
-                         const char* notNull) {
+                       const char* notNull) {
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
         write(data[i]);
       }
     }
   }
 
+  void RleEncoder::add(const int32_t* data, uint64_t numValues,

Review Comment:
   Why not use a template function to repeat the code? static_cast does not have any runtime penalty.



##########
c++/src/ColumnWriter.cc:
##########
@@ -456,126 +456,107 @@ namespace orc {
     }
   }
 
+  template <typename BatchType, typename IntType>
   class IntegerColumnWriter : public ColumnWriter {
   public:
     IntegerColumnWriter(
                         const Type& type,
                         const StreamsFactory& factory,
-                        const WriterOptions& options);
+                        const WriterOptions& options) :
+                          ColumnWriter(type, factory, options),
+                          rleVersion(options.getRleVersion()) {
+      std::unique_ptr<BufferedOutputStream> dataStream =

Review Comment:
   nit: you can still separate template class declaration with definition like before to keep the class declaration compact. See here for reference: https://stackoverflow.com/questions/13351474/c-separate-declaration-definition-for-template-function-in-template-class



##########
c++/src/ColumnWriter.cc:
##########
@@ -456,126 +456,107 @@ namespace orc {
     }
   }
 
+  template <typename BatchType, typename IntType>
   class IntegerColumnWriter : public ColumnWriter {
   public:
     IntegerColumnWriter(
                         const Type& type,
                         const StreamsFactory& factory,
-                        const WriterOptions& options);
+                        const WriterOptions& options) :
+                          ColumnWriter(type, factory, options),
+                          rleVersion(options.getRleVersion()) {
+      std::unique_ptr<BufferedOutputStream> dataStream =
+        factory.createStream(proto::Stream_Kind_DATA);
+      rleEncoder = createRleEncoder(
+                                    std::move(dataStream),
+                                    true,
+                                    rleVersion,
+                                    memPool,
+                                    options.getAlignedBitpacking());
+
+      if (enableIndex) {
+        recordPosition();
+      }
+    }
 
     virtual void add(ColumnVectorBatch& rowBatch,
                      uint64_t offset,
                      uint64_t numValues,
-                     const char* incomingMask) override;
-
-    virtual void flush(std::vector<proto::Stream>& streams) override;
-
-    virtual uint64_t getEstimatedSize() const override;
+                     const char* incomingMask) override {
+      const BatchType* intBatch =
+        dynamic_cast<const BatchType*>(&rowBatch);
+      if (intBatch == nullptr) {
+        throw InvalidArgument("Failed to cast to LongVectorBatch");

Review Comment:
   It may not be LongVectorBatch now. This error message should be changed as well.



##########
c++/src/ColumnReader.hh:
##########
@@ -167,7 +167,8 @@ namespace orc {
    * Create a reader for the given stripe.
    */
   std::unique_ptr<ColumnReader> buildReader(const Type& type,
-                                            StripeStreams& stripe);
+                                            StripeStreams& stripe,
+                                            bool enableFixedWidthNumericVectorBatch = false);

Review Comment:
   Shorten it



##########
c++/src/ColumnReader.cc:
##########
@@ -531,33 +524,34 @@ namespace orc {
     return numValues;
   }
 
-  template<TypeKind columnKind, bool isLittleEndian>
-  void DoubleColumnReader<columnKind, isLittleEndian>::next(
-      ColumnVectorBatch& rowBatch,
-      uint64_t numValues,
-      char *notNull) {
+  template<TypeKind columnKind, bool isLittleEndian, typename ValueType, typename BatchType>
+  void DoubleColumnReader<columnKind, isLittleEndian, ValueType, BatchType>::next(
+    ColumnVectorBatch& rowBatch,
+    uint64_t numValues,
+    char *notNull) {
     ColumnReader::next(rowBatch, numValues, notNull);
     // update the notNull from the parent class
     notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
-    double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data();
+    ValueType* outArray =
+      reinterpret_cast<ValueType*>(dynamic_cast<BatchType&>(rowBatch).data.data());
 
     if (columnKind == FLOAT) {

Review Comment:
   if constexpr (columnKind == FLOAT)



##########
c++/include/orc/Vector.hh:
##########
@@ -88,28 +89,88 @@ namespace orc {
     ColumnVectorBatch& operator=(const ColumnVectorBatch&);
   };
 
-  struct LongVectorBatch: public ColumnVectorBatch {
-    LongVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~LongVectorBatch();
+  template <typename ValueType>
+  struct IntegerVectorBatch: public ColumnVectorBatch {
+    IntegerVectorBatch(uint64_t cap, MemoryPool& pool)
+                       : ColumnVectorBatch(cap, pool),
+                       data(pool, cap) {
+      // PASS
+    }
 
-    DataBuffer<int64_t> data;
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+    virtual ~IntegerVectorBatch() {
+      // PASS
+    }
+
+    std::string toString() const {
+      std::ostringstream buffer;
+      buffer << "Long vector <" << numElements << " of " << capacity << ">";
+      return buffer.str();
+    }
+
+    void resize(uint64_t cap) {
+      if (capacity < cap) {
+        ColumnVectorBatch::resize(cap);
+        data.resize(cap);
+      }
+    }
+
+    void clear() {
+      numElements = 0;
+    }
+
+    uint64_t getMemoryUsage() {
+      return ColumnVectorBatch::getMemoryUsage() +
+        static_cast<uint64_t>(data.capacity() * sizeof(int64_t));
+    }
+
+    DataBuffer<ValueType> data;
   };
 
-  struct DoubleVectorBatch: public ColumnVectorBatch {
-    DoubleVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~DoubleVectorBatch();
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+  using LongVectorBatch = IntegerVectorBatch<int64_t>;
+  using IntVectorBatch = IntegerVectorBatch<int32_t>;
+  using ShortVectorBatch = IntegerVectorBatch<int16_t>;
+  using ByteVectorBatch = IntegerVectorBatch<int8_t>;
+
+  template <typename FloatType>
+  struct BaseDoubleVectorBatch: public ColumnVectorBatch {

Review Comment:
   Rename it to FloatingVectorBatch



##########
c++/src/ColumnWriter.cc:
##########
@@ -3061,15 +3050,36 @@ namespace orc {
                                  type,
                                  factory,
                                  options));
+      case SHORT:
+        if (options.getEnableFixedWidthNumericVectorBatch()) {
+          return std::unique_ptr<ColumnWriter>(
+            new IntegerColumnWriter<ShortVectorBatch, int16_t>(
+                                                               type,
+                                                               factory,
+                                                               options));
+        }
       case INT:
+        if (options.getEnableFixedWidthNumericVectorBatch()) {
+          return std::unique_ptr<ColumnWriter>(
+            new IntegerColumnWriter<IntVectorBatch, int32_t>(
+                                                             type,
+                                                             factory,
+                                                             options));
+        }
       case LONG:
-      case SHORT:
         return std::unique_ptr<ColumnWriter>(
-          new IntegerColumnWriter(
-                                  type,
-                                  factory,
-                                  options));
+          new IntegerColumnWriter<LongVectorBatch, int64_t>(

Review Comment:
   It seems redundant to define the 2nd template parameter



##########
c++/include/orc/Vector.hh:
##########
@@ -88,28 +89,88 @@ namespace orc {
     ColumnVectorBatch& operator=(const ColumnVectorBatch&);
   };
 
-  struct LongVectorBatch: public ColumnVectorBatch {
-    LongVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~LongVectorBatch();
+  template <typename ValueType>
+  struct IntegerVectorBatch: public ColumnVectorBatch {
+    IntegerVectorBatch(uint64_t cap, MemoryPool& pool)
+                       : ColumnVectorBatch(cap, pool),
+                       data(pool, cap) {
+      // PASS
+    }
 
-    DataBuffer<int64_t> data;
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+    virtual ~IntegerVectorBatch() {
+      // PASS
+    }
+
+    std::string toString() const {
+      std::ostringstream buffer;
+      buffer << "Long vector <" << numElements << " of " << capacity << ">";
+      return buffer.str();
+    }
+
+    void resize(uint64_t cap) {
+      if (capacity < cap) {
+        ColumnVectorBatch::resize(cap);
+        data.resize(cap);
+      }
+    }
+
+    void clear() {
+      numElements = 0;
+    }
+
+    uint64_t getMemoryUsage() {
+      return ColumnVectorBatch::getMemoryUsage() +
+        static_cast<uint64_t>(data.capacity() * sizeof(int64_t));
+    }
+
+    DataBuffer<ValueType> data;
   };
 
-  struct DoubleVectorBatch: public ColumnVectorBatch {
-    DoubleVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~DoubleVectorBatch();
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+  using LongVectorBatch = IntegerVectorBatch<int64_t>;
+  using IntVectorBatch = IntegerVectorBatch<int32_t>;
+  using ShortVectorBatch = IntegerVectorBatch<int16_t>;
+  using ByteVectorBatch = IntegerVectorBatch<int8_t>;
+
+  template <typename FloatType>
+  struct BaseDoubleVectorBatch: public ColumnVectorBatch {
+    BaseDoubleVectorBatch(uint64_t cap, MemoryPool& pool)
+      : ColumnVectorBatch(cap, pool),
+        data(pool, cap) {
+      // PASS
+    }
+
+    virtual ~BaseDoubleVectorBatch() {
+      // PASS
+    }
+
+    std::string toString() const {
+      std::ostringstream buffer;
+      buffer << "Double vector <" << numElements << " of " << capacity << ">";

Review Comment:
   It may not be "Double vector" any more.



##########
c++/src/RLE.hh:
##########
@@ -126,6 +134,15 @@ namespace orc {
     virtual void next(int64_t* data, uint64_t numValues,
                       const char* notNull) = 0;
 
+    virtual void next(int32_t* data, uint64_t numValues,
+                      const char* notNull) = 0;
+
+    virtual void next(int16_t* data, uint64_t numValues,
+                      const char* notNull) = 0;
+
+    virtual void next(int8_t* data, uint64_t numValues,

Review Comment:
   ditto



##########
c++/include/orc/Vector.hh:
##########
@@ -88,28 +89,88 @@ namespace orc {
     ColumnVectorBatch& operator=(const ColumnVectorBatch&);
   };
 
-  struct LongVectorBatch: public ColumnVectorBatch {
-    LongVectorBatch(uint64_t capacity, MemoryPool& pool);
-    virtual ~LongVectorBatch();
+  template <typename ValueType>
+  struct IntegerVectorBatch: public ColumnVectorBatch {
+    IntegerVectorBatch(uint64_t cap, MemoryPool& pool)
+                       : ColumnVectorBatch(cap, pool),
+                       data(pool, cap) {
+      // PASS
+    }
 
-    DataBuffer<int64_t> data;
-    std::string toString() const;
-    void resize(uint64_t capacity);
-    void clear();
-    uint64_t getMemoryUsage();
+    virtual ~IntegerVectorBatch() {
+      // PASS
+    }
+
+    std::string toString() const {
+      std::ostringstream buffer;
+      buffer << "Long vector <" << numElements << " of " << capacity << ">";

Review Comment:
   It may not be "Long vector" any more



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org