You are viewing a plain text version of this content. The canonical link for it is here.

Posted to issues@orc.apache.org by "ffacs (via GitHub)" <gi...@apache.org> on 2023/03/27 15:55:11 UTC

[GitHub] [orc] ffacs opened a new pull request, #1454: ORC-1385 [C++] Support schema evolution from numeric to numeric

ffacs opened a new pull request, #1454:
URL: https://github.com/apache/orc/pull/1454

   ### What changes were proposed in this pull request?
   Support schema evolution by converting between types within {boolean, tinyint, smallint, int, bigint, float, double}.
   
   
   ### Why are the changes needed?
   To support schema evolution in C++.
   
   ### How was this patch tested?
   UT passed


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155544281


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   We probably need to fix that as well.
   
   What about adding a new function to `ColumnVectorBatch` like `int32_t fixed_type_size() const` to return the width in bytes for fixed-length types? For variable-length or nested types we can return -1.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] dongjoon-hyun commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "dongjoon-hyun (via GitHub)" <gi...@apache.org>.

dongjoon-hyun commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1238253277


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,446 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchPtrType>
+  static inline DestBatchPtrType SafeCastBatchTo(ColumnVectorBatch* batch) {
+    auto result = dynamic_cast<DestBatchPtrType>(batch);
+    if (result == nullptr) {
+      std::ostringstream ss;
+      ss << "Bad cast when convert from ColumnVectorBatch to "
+         << typeid(typename std::remove_const<
+                       typename std::remove_pointer<DestBatchPtrType>::type>::type)
+                .name();
+      throw InvalidArgument(ss.str());
+    }
+    return result;
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                            throwOnOverflow);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                          throwOnOverflow);
+        }
+      }
+    }
+  };
+
+  // { boolean, byte, short, int, long, float, double } -> { boolean }
+  template <typename FileTypeBatch>
+  class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+      : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+        }
+      }
+    }
+  };
+
+#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
+  using FROM##To##TO##ColumnReader =                  \
+      NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;
+
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Float, float)
+  // Floating to integer
+  DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t)
+  // Integer to Floating
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
+
+#define CASE_CREATE_READER(TYPE, CONVERT) \
+  case TYPE:                              \
+    return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
+
+#define CASE_EXCEPTION                                                                 \
+  default:                                                                             \
+    throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
+                               _readType.toString());
+
+  std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
+                                                   bool useTightNumericVector,
+                                                   bool throwOnOverflow) {
+    if (!useTightNumericVector) {
+      throw SchemaEvolutionError(
+          "SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
+          "option useTightNumericVector");
+    }
+    const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType);
+
+    switch (fileType.getKind()) {
+      case BOOLEAN: {
+        switch (_readType.getKind()) {
+          CASE_CREATE_READER(BYTE, BooleanToByte);

Review Comment:
   For example, is the `;` required on this line?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1161379658


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,445 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchRefType>
+  static inline DestBatchRefType SafeCastBatchTo(ColumnVectorBatch& batch, const Type& readType) {
+    try {
+      return dynamic_cast<DestBatchRefType>(batch);

Review Comment:
   Why not cast to a pointer type and check for null directly? It would be more efficient to avoid a try-catch on every cast.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,445 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchRefType>
+  static inline DestBatchRefType SafeCastBatchTo(ColumnVectorBatch& batch, const Type& readType) {
+    try {
+      return dynamic_cast<DestBatchRefType>(batch);
+    } catch (const std::bad_cast& e) {
+      std::ostringstream ss;
+      ss << "Bad cast when convert from ColumnVectorBatch to "
+         << typeid(typename std::remove_reference<DestBatchRefType>::type).name()
+         << " Column ID: " << readType.getColumnId() << " Type: " << readType.toString();
+      throw InvalidArgument(ss.str());
+    }
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = dynamic_cast<const FileTypeBatch&>(*data);
+      auto& dstBatch = SafeCastBatchTo<ReadTypeBatch&>(rowBatch, readType);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                            throwOnOverflow);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                          throwOnOverflow);
+        }
+      }
+    }
+  };
+
+  // { boolean, byte, short, int, long, float, double } -> { boolean }
+  template <typename FileTypeBatch>
+  class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+      : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = dynamic_cast<const FileTypeBatch&>(*data);

Review Comment:
   ditto



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,445 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchRefType>
+  static inline DestBatchRefType SafeCastBatchTo(ColumnVectorBatch& batch, const Type& readType) {
+    try {
+      return dynamic_cast<DestBatchRefType>(batch);
+    } catch (const std::bad_cast& e) {
+      std::ostringstream ss;
+      ss << "Bad cast when convert from ColumnVectorBatch to "
+         << typeid(typename std::remove_reference<DestBatchRefType>::type).name()
+         << " Column ID: " << readType.getColumnId() << " Type: " << readType.toString();
+      throw InvalidArgument(ss.str());
+    }
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = dynamic_cast<const FileTypeBatch&>(*data);

Review Comment:
   It may also throw here; it would be better to apply `SafeCastBatchTo` as well.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1238732520


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,446 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchPtrType>
+  static inline DestBatchPtrType SafeCastBatchTo(ColumnVectorBatch* batch) {
+    auto result = dynamic_cast<DestBatchPtrType>(batch);
+    if (result == nullptr) {
+      std::ostringstream ss;
+      ss << "Bad cast when convert from ColumnVectorBatch to "
+         << typeid(typename std::remove_const<
+                       typename std::remove_pointer<DestBatchPtrType>::type>::type)
+                .name();
+      throw InvalidArgument(ss.str());
+    }
+    return result;
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                            throwOnOverflow);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                          throwOnOverflow);
+        }
+      }
+    }
+  };
+
+  // { boolean, byte, short, int, long, float, double } -> { boolean }
+  template <typename FileTypeBatch>
+  class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+      : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+        }
+      }
+    }
+  };
+
+#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
+  using FROM##To##TO##ColumnReader =                  \
+      NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;
+
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Float, float)
+  // Floating to integer
+  DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t)
+  // Integer to Floating
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
+
+#define CASE_CREATE_READER(TYPE, CONVERT) \
+  case TYPE:                              \
+    return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
+
+#define CASE_EXCEPTION                                                                 \
+  default:                                                                             \
+    throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
+                               _readType.toString());
+
+  std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
+                                                   bool useTightNumericVector,
+                                                   bool throwOnOverflow) {
+    if (!useTightNumericVector) {
+      throw SchemaEvolutionError(
+          "SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
+          "option useTightNumericVector");
+    }
+    const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType);
+
+    switch (fileType.getKind()) {
+      case BOOLEAN: {
+        switch (_readType.getKind()) {
+          CASE_CREATE_READER(BYTE, BooleanToByte);

Review Comment:
   How did you get the failure? Via docker test?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] coderex2522 commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "coderex2522 (via GitHub)" <gi...@apache.org>.

coderex2522 commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1162603647


##########
c++/src/Reader.hh:
##########
@@ -237,6 +241,10 @@ namespace orc {
     bool getThrowOnHive11DecimalOverflow() const;
     bool getIsDecimalAsLong() const;
     int32_t getForcedScaleOnHive11Decimal() const;
+
+    const SchemaEvolution& getSchemaEvolution() const {

Review Comment:
   Is it possible to consider unifying the return type of the getSchemaEvolution() function in class StripeStreams and class RowReaderImpl?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155519354


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   But what if the ColumnVectorBatch is created by another reader which was created without the useTightNumericVector option?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155533511


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   > Does the current `ColumnReader` have same issue?
   
   Yes, I think it does.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] dongjoon-hyun commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "dongjoon-hyun (via GitHub)" <gi...@apache.org>.

dongjoon-hyun commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1238264590


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,446 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchPtrType>
+  static inline DestBatchPtrType SafeCastBatchTo(ColumnVectorBatch* batch) {
+    auto result = dynamic_cast<DestBatchPtrType>(batch);
+    if (result == nullptr) {
+      std::ostringstream ss;
+      ss << "Bad cast when convert from ColumnVectorBatch to "
+         << typeid(typename std::remove_const<
+                       typename std::remove_pointer<DestBatchPtrType>::type>::type)
+                .name();
+      throw InvalidArgument(ss.str());
+    }
+    return result;
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                            throwOnOverflow);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i,
+                                          throwOnOverflow);
+        }
+      }
+    }
+  };
+
+  // { boolean, byte, short, int, long, float, double } -> { boolean }
+  template <typename FileTypeBatch>
+  class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+      : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<BooleanVectorBatch*>(&rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+        }
+      }
+    }
+  };
+
+#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
+  using FROM##To##TO##ColumnReader =                  \
+      NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;
+
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Float, float)
+  // Floating to integer
+  DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t)
+  // Integer to Floating
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
+
+#define CASE_CREATE_READER(TYPE, CONVERT) \
+  case TYPE:                              \
+    return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
+
+#define CASE_EXCEPTION                                                                 \
+  default:                                                                             \
+    throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
+                               _readType.toString());
+
+  std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
+                                                   bool useTightNumericVector,
+                                                   bool throwOnOverflow) {
+    if (!useTightNumericVector) {
+      throw SchemaEvolutionError(
+          "SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
+          "option useTightNumericVector");
+    }
+    const auto& _readType = *stripe.getSchemaEvolution()->getReadType(fileType);
+
+    switch (fileType.getKind()) {
+      case BOOLEAN: {
+        switch (_readType.getKind()) {
+          CASE_CREATE_READER(BYTE, BooleanToByte);

Review Comment:
   This is fixed via https://github.com/apache/orc/pull/1500#discussion_r1238261627



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1159726939


##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();

Review Comment:
   > Move it to above line 1220.
   
   I'm not sure I get why we should do this. I use the predicate `selectedSchema.get() == nullptr` to check whether it is the first time; if we move line 1221 above line 1220, then it would always be true.



##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();

Review Comment:
   > Move it to above line 1220.
   
   I'm not sure I get why we should do this. I use the predicate `selectedSchema.get() == nullptr` to check whether it is the first time; if we move line 1221 above line 1220, then it would always be true.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1156085399


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   OK, that sounds good. 
   
   However, `isTight` here is a little bit confusing since it applies only to numeric types. We need to be careful about adding any field to the public API, and it should also be used together with its type.
   
   In the `ColumnReader`, we always check the result of dynamic_cast of the vector batch. Is it enough if we can use dynamic_cast and check if it is not null in the `ConvertColumnReader`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1161460276


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,445 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  template <typename DestBatchRefType>
+  static inline DestBatchRefType SafeCastBatchTo(ColumnVectorBatch& batch, const Type& readType) {
+    try {
+      return dynamic_cast<DestBatchRefType>(batch);

Review Comment:
   Done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1159277826


##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {

Review Comment:
   ```suggestion
       auto selectedFileType = &getSelectedType();
       if (schemaEvolution.getReadType()) {
   ```



##########
c++/src/ColumnReader.cc:
##########
@@ -828,7 +830,8 @@ namespace orc {
     std::vector<std::unique_ptr<ColumnReader>> children;
 
    public:
-    StructColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false);
+    StructColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false,

Review Comment:
   I am wondering whether we can introduce a new ColumnReaderOptions to hold `useTightNumericVector` and `throwOnSchemaEvolutionOverflow`, in case we add more options later. But that can be a separate patch since this is not public.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,432 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                               bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = dynamic_cast<const FileTypeBatch&>(*data);

Review Comment:
   Can we wrap `dynamic_cast` so that it throws a clear error message if the cast fails? Something like
   ```cpp
   template <typename TO, typename FROM>
   TO* safe_cast(FROM* from) {
     auto to = dynamic_cast<TO*>(from);
     if (to == nullptr) {
        throw ... // maybe a new exception type for cast in Exceptions.hh
     }
     return to;
   }
   ```



##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();
+      auto readType = schemaEvolution.getReadType();
+      std::set<uint64_t> readColumns, fileColumns;
+      getColumnIds(readType, readColumns);
+      getColumnIds(fileSchema, fileColumns);
+      if (readColumns != fileColumns) {
+        std::ostringstream ss;
+        ss << "The selected schema " << fileSchema->toString() << " doesn't match read type "
+           << readType->toString();
+        throw SchemaEvolutionError(ss.str());
+      }
+    }
+    const Type& readType =
+        schemaEvolution.getReadType() ? *schemaEvolution.getReadType() : getSelectedType();

Review Comment:
   ```suggestion
           schemaEvolution.getReadType() ? *schemaEvolution.getReadType() : *selectedFileType;
   ```



##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();

Review Comment:
   Move it above line 1220.



##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();
+      auto readType = schemaEvolution.getReadType();
+      std::set<uint64_t> readColumns, fileColumns;
+      getColumnIds(readType, readColumns);
+      getColumnIds(fileSchema, fileColumns);

Review Comment:
   ```suggestion
         getColumnIds(selectedFileType, fileColumns);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1150018545


##########
c++/src/ColumnReader.hh:
##########
@@ -101,6 +103,12 @@ namespace orc {
      * encoded in RLE.
      */
     virtual bool isDecimalAsLong() const = 0;
+
+    /**
+     * @return get schema evolution utility object
+     *

Review Comment:
   remove this line



##########
c++/include/orc/Reader.hh:
##########
@@ -336,6 +336,16 @@ namespace orc {
      * @return if not set, the default is false
      */
     bool getUseTightNumericVector() const;
+
+    /**
+     * Set read type for schema evolution
+     */
+    RowReaderOptions& setReadType(std::shared_ptr<Type>& type);

Review Comment:
   ```suggestion
       RowReaderOptions& setReadType(std::shared_ptr<Type> type);
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe)
+      : ColumnReader(_readType, stripe), readType(_readType) {
+    reader = buildReader(fileType, stripe, true, false);
+    data = fileType.createRowBatch(0, memoryPool, false, true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  static inline void setNull(ColumnVectorBatch& dstBatch, uint64_t idx) {
+    dstBatch.notNull.data()[idx] = 0;
+    dstBatch.hasNulls = true;
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  // set null if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          setNull(destBatch, idx);
+          return;
+        }
+      }
+    } else {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+        if (destValue != destValue) {  // check is NaN
+          setNull(destBatch, idx);
+        }
+      } else {
+        if (!downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
+          setNull(destBatch, idx);
+        }
+      }
+    }
+  }
+
+  // { boolean, byte, short, int, long, float, double } ->
+  // { byte, short, int, long, float, double }
+  template <typename FileTypeBatch, typename ReadTypeBatch, typename ReadType>
+  class NumericConvertColumnReader : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe)
+        : ConvertColumnReader(_readType, fileType, stripe) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = dynamic_cast<const FileTypeBatch&>(*data);
+      auto& dstBatch = dynamic_cast<ReadTypeBatch&>(rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          convertNumericElement<ReadType>(srcBatch.data[i], dstBatch.data[i], rowBatch, i);
+        }
+      }
+    }
+  };
+
+  // { boolean, byte, short, int, long, float, double } -> { boolean }
+  template <typename FileTypeBatch>
+  class NumericConvertColumnReader<FileTypeBatch, BooleanVectorBatch, bool>
+      : public ConvertColumnReader {
+   public:
+    NumericConvertColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe)
+        : ConvertColumnReader(_readType, fileType, stripe) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+      const auto& srcBatch = dynamic_cast<const FileTypeBatch&>(*data);
+      auto& dstBatch = dynamic_cast<BooleanVectorBatch&>(rowBatch);
+      if (rowBatch.hasNulls) {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          if (rowBatch.notNull[i]) {
+            dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+          }
+        }
+      } else {
+        for (uint64_t i = 0; i < rowBatch.numElements; ++i) {
+          dstBatch.data[i] = (static_cast<int64_t>(srcBatch.data[i]) == 0 ? 0 : 1);
+        }
+      }
+    }
+  };
+
+#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
+  using FROM##To##TO##ColumnReader =                  \
+      NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;
+
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Float, float)
+  // Floating to integer
+  DEFINE_NUMERIC_CONVERT_READER(Float, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Float, Long, int64_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Boolean, bool)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Byte, int8_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Short, int16_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Int, int32_t)
+  DEFINE_NUMERIC_CONVERT_READER(Double, Long, int64_t)
+  // Integer to Floating
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Float, float)
+  DEFINE_NUMERIC_CONVERT_READER(Boolean, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Byte, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Short, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
+  DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
+
+#define CASE_CREATE_READER(TYPE, CONVERT) \
+  case TYPE:                              \
+    return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe);
+
+#define CASE_EXCEPTION                                                                 \
+  default:                                                                             \
+    throw SchemaEvolutionError("Cannot convert from " + fileType.toString() + " to " + \
+                               _readType.toString());
+
+  std::unique_ptr<ColumnReader> buildConvertReader(const Type& fileType, StripeStreams& stripe,
+                                                   bool _useTightNumericVector) {
+    if (!_useTightNumericVector) {
+      throw SchemaEvolutionError("Schema Evolution only support tight numeric vector");

Review Comment:
   It would be good to provide clear guidance on which config to set to enable this.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe)
+      : ColumnReader(_readType, stripe), readType(_readType) {
+    reader = buildReader(fileType, stripe, true, false);
+    data = fileType.createRowBatch(0, memoryPool, false, true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  static inline void setNull(ColumnVectorBatch& dstBatch, uint64_t idx) {
+    dstBatch.notNull.data()[idx] = 0;
+    dstBatch.hasNulls = true;
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if (std::is_same<ReadType, int64_t>::value) {

Review Comment:
   ```suggestion
       if constexpr (std::is_same<ReadType, int64_t>::value) {
   ```



##########
c++/include/orc/Reader.hh:
##########
@@ -336,6 +336,16 @@ namespace orc {
      * @return if not set, the default is false
      */
     bool getUseTightNumericVector() const;
+
+    /**
+     * Set read type for schema evolution
+     */
+    RowReaderOptions& setReadType(std::shared_ptr<Type>& type);

Review Comment:
   In addition, we need to check whether the read type matches the selected columns. Otherwise we may run into problems.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe)
+      : ColumnReader(_readType, stripe), readType(_readType) {
+    reader = buildReader(fileType, stripe, true, false);
+    data = fileType.createRowBatch(0, memoryPool, false, true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  static inline void setNull(ColumnVectorBatch& dstBatch, uint64_t idx) {
+    dstBatch.notNull.data()[idx] = 0;
+    dstBatch.hasNulls = true;
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  // set null if overflow

Review Comment:
   Should we add a mode that either nullifies the value or throws when overflow is detected? Different engines may expect different behaviors.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,415 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe)
+      : ColumnReader(_readType, stripe), readType(_readType) {
+    reader = buildReader(fileType, stripe, true, false);

Review Comment:
   It is good practice to add a comment like `/*param_name=*/true` for better readability.



##########
c++/src/ConvertColumnReader.hh:
##########
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_CONVERT_COLUMN_READER_HH
+#define ORC_CONVERT_COLUMN_READER_HH
+
+#include "ColumnReader.hh"
+#include "SchemaEvolution.hh"
+
+namespace orc {
+
+  class ConvertColumnReader : public ColumnReader {
+   public:
+    ConvertColumnReader(const Type& readType, const Type& fileType, StripeStreams& stripe);
+
+    // override next() to implement convert logic, and inherit nextEncoded(),
+    // nextLazy() and nextLazyEncoded() from ColumnReader. override them only
+    // required.

Review Comment:
   The comment does not seem to match the code.



##########
c++/include/orc/Reader.hh:
##########
@@ -336,6 +336,16 @@ namespace orc {
      * @return if not set, the default is false
      */
     bool getUseTightNumericVector() const;
+
+    /**
+     * Set read type for schema evolution
+     */
+    RowReaderOptions& setReadType(std::shared_ptr<Type>& type);

Review Comment:
   We can safely use std::move() in the implementation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155528318


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   Does the current `ColumnReader` have same issue?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1156013256


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   > We probably need to fix that as well.
   > 
   > What about adding a new function to `ColumnVectorBatch` like `int32_t fixed_type_size() const` to return the width in bytes for fixed length type? For variable length type or nested type we can return -1.
   
   I think what we are going to do is handle incorrect usage by users (such as passing a vector that was created by other readers). However, we don’t know what causes the failure of casting. The vector may have been created by another reader with a different read type from the current reader, or it may have been created by another reader with a different `useTightNumericVector` option. `fixed_type_size()` may not help us infer what the user did wrong, but with `isTight`, we could suggest that the user check their options.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1156109311


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   > OK, that sounds good.
   > 
   > However, `isTight` here is a little bit confusing since it applies to only numeric types. We need to be careful to add any field to the public api. And it should also be used with its type together.
   > 
   > In the `ColumnReader`, we always check the result of dynamic_cast of the vector batch. Is it enough if we can use dynamic_cast and check if it is not null in the `ConvertColumnReader`?
   
   I think that's OK, so we will only verify the `ReaderOption` and check the result of dynamic_cast in `ConvertColumnReader`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155544281


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   We probably need to fix that as well.
   
   What about adding a new function to `ColumnVectorBatch` like `int32_t fixed_type_size() const` to return the width in bytes for fixed length type? For variable length type or nested type we can return -1.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] coderex2522 commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "coderex2522 (via GitHub)" <gi...@apache.org>.

coderex2522 commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1162603647


##########
c++/src/Reader.hh:
##########
@@ -237,6 +241,10 @@ namespace orc {
     bool getThrowOnHive11DecimalOverflow() const;
     bool getIsDecimalAsLong() const;
     int32_t getForcedScaleOnHive11Decimal() const;
+
+    const SchemaEvolution& getSchemaEvolution() const {

Review Comment:
   Is it possible to unify the return type (`const SchemaEvolution*`) of the `getSchemaEvolution()` function in class `StripeStreams` and class `RowReaderImpl`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155988630


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,432 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    if (!rowBatch.isTight) {
+      throw SchemaEvolutionError(
+          "SchemaEvolution only support tight vector, please create ColumnVectorBatch with option "
+          "useTightNumericVector");
+    }
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());

Review Comment:
   > I guess this is not required
   
   https://github.com/apache/orc/blob/070c1a5dce3fd7f6b6b8938cc3d872a7b67b1b28/c%2B%2B/src/MemoryPool.cc#L120-L127
   When there are no nulls and we are going to set null when an overflow in conversion occurs, the `notNull` of other elements should be `1`, but they are `0` after resizing, so the `memset` is needed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1155482409


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   Is it possible not to add this? The reader can get this information by the provided `ReaderOption`



##########
c++/include/orc/Reader.hh:
##########
@@ -340,12 +340,22 @@ namespace orc {
     /**
      * Set read type for schema evolution
      */
-    RowReaderOptions& setReadType(std::shared_ptr<Type>& type);
+    RowReaderOptions& setReadType(std::shared_ptr<Type> type);
 
     /**
      * Get read type for schema evolution
      */
     std::shared_ptr<Type>& getReadType() const;
+
+    /**
+     * Set should reader throw a exception on conversion between different data types overflow

Review Comment:
   ```suggestion
        * Set whether reader throws or returns null when value overflows for schema evolution.
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,432 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    if (!rowBatch.isTight) {
+      throw SchemaEvolutionError(
+          "SchemaEvolution only support tight vector, please create ColumnVectorBatch with option "
+          "useTightNumericVector");
+    }
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());

Review Comment:
   I guess this is not required



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,432 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,
+                         /*throwOnOverflow=*/false, /*convertToReadType*/ false);
+    data =
+        fileType.createRowBatch(0, memoryPool, /*encoded=*/false, /*useTightNumericVector=*/true);
+  }
+
+  void ConvertColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
+    if (!rowBatch.isTight) {
+      throw SchemaEvolutionError(
+          "SchemaEvolution only support tight vector, please create ColumnVectorBatch with option "
+          "useTightNumericVector");
+    }
+    reader->next(*data, numValues, notNull);
+    rowBatch.resize(data->capacity);
+    rowBatch.numElements = data->numElements;
+    rowBatch.hasNulls = data->hasNulls;
+    if (!rowBatch.hasNulls) {
+      memset(rowBatch.notNull.data(), 1, data->notNull.size());
+    } else {
+      memcpy(rowBatch.notNull.data(), data->notNull.data(), data->notNull.size());
+    }
+  }
+
+  uint64_t ConvertColumnReader::skip(uint64_t numValues) {
+    return reader->skip(numValues);
+  }
+
+  void ConvertColumnReader::seekToRowGroup(
+      std::unordered_map<uint64_t, PositionProvider>& positions) {
+    reader->seekToRowGroup(positions);
+  }
+
+  static inline bool canFitInLong(double value) {
+    constexpr double MIN_LONG_AS_DOUBLE = -0x1p63;
+    constexpr double MAX_LONG_AS_DOUBLE_PLUS_ONE = 0x1p63;
+    return ((MIN_LONG_AS_DOUBLE - value < 1.0) && (value < MAX_LONG_AS_DOUBLE_PLUS_ONE));
+  }
+
+  template <typename FileType, typename ReadType>
+  static inline void handleOverflow(ColumnVectorBatch& dstBatch, uint64_t idx, bool shouldThrow) {
+    if (!shouldThrow) {
+      dstBatch.notNull.data()[idx] = 0;
+      dstBatch.hasNulls = true;
+    } else {
+      std::ostringstream ss;
+      ss << "Overflow when convert from " << typeid(FileType).name() << " to "
+         << typeid(ReadType).name();
+      throw SchemaEvolutionError(ss.str());
+    }
+  }
+
+  // return false if overflow
+  template <typename ReadType>
+  static bool downCastToInteger(ReadType& dstValue, int64_t inputLong) {
+    dstValue = static_cast<ReadType>(inputLong);
+    if constexpr (std::is_same<ReadType, int64_t>::value) {
+      return true;
+    }
+    if (static_cast<int64_t>(dstValue) != inputLong) {
+      return false;
+    }
+    return true;
+  }
+
+  // set null or throw exception if overflow
+  template <typename ReadType, typename FileType>
+  static inline void convertNumericElement(const FileType& srcValue, ReadType& destValue,
+                                           ColumnVectorBatch& destBatch, uint64_t idx,
+                                           bool shouldThrow) {
+    constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
+    constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
+    int64_t longValue = static_cast<int64_t>(srcValue);
+    if (isFileTypeFloatingPoint) {
+      if (isReadTypeFloatingPoint) {
+        destValue = static_cast<ReadType>(srcValue);
+      } else {
+        if (!canFitInLong(static_cast<double>(srcValue)) ||
+            !downCastToInteger(destValue, longValue)) {
+          handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
+          return;

Review Comment:
   This line is redundant.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -0,0 +1,432 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvertColumnReader.hh"
+
+namespace orc {
+
+  // Assume that we are using tight numeric vector batch
+  using BooleanVectorBatch = ByteVectorBatch;
+
+  ConvertColumnReader::ConvertColumnReader(const Type& _readType, const Type& fileType,
+                                           StripeStreams& stripe, bool _throwOnOverflow)
+      : ColumnReader(_readType, stripe), readType(_readType), throwOnOverflow(_throwOnOverflow) {
+    reader = buildReader(fileType, stripe, /*useTightNumericVector=*/true,

Review Comment:
   What about adding an input parameter `useTightNumericVector` to the function and throwing if it is false? That way, we no longer have to check it the way line 36 does.



##########
c++/src/ColumnReader.cc:
##########
@@ -1699,7 +1712,21 @@ namespace orc {
    * Create a reader for the given stripe.
    */
   std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe,
-                                            bool useTightNumericVector) {
+                                            bool useTightNumericVector,
+                                            bool throwOnSchemaEvolutionOverflow,
+                                            bool convertToReadType) {
+    if (convertToReadType && stripe.getSchemaEvolution()) {
+      if (stripe.getSchemaEvolution()->needConvert(type)) {
+        if (!useTightNumericVector) {
+          throw SchemaEvolutionError(
+              "SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
+              "option "
+              "useTightNumericVector");

Review Comment:
   ```suggestion
                 "SchemaEvolution only support tight vector, please create ColumnVectorBatch with "
                 "option useTightNumericVector");
   ```



##########
c++/include/orc/Reader.hh:
##########
@@ -340,12 +340,22 @@ namespace orc {
     /**
      * Set read type for schema evolution
      */
-    RowReaderOptions& setReadType(std::shared_ptr<Type>& type);
+    RowReaderOptions& setReadType(std::shared_ptr<Type> type);
 
     /**
      * Get read type for schema evolution
      */
     std::shared_ptr<Type>& getReadType() const;
+
+    /**
+     * Set should reader throw a exception on conversion between different data types overflow
+     */
+    RowReaderOptions& throwOnSchemaEvolutionOverflow(bool shouldThrow);
+
+    /**
+     * Get should reader throw a exception on conversion between different data types overflow

Review Comment:
   ```suggestion
        * Whether reader throws or returns null when value overflows for schema evolution.
   ```



##########
c++/src/ColumnReader.cc:
##########
@@ -1699,7 +1712,21 @@ namespace orc {
    * Create a reader for the given stripe.
    */
   std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe,
-                                            bool useTightNumericVector) {
+                                            bool useTightNumericVector,
+                                            bool throwOnSchemaEvolutionOverflow,
+                                            bool convertToReadType) {
+    if (convertToReadType && stripe.getSchemaEvolution()) {

Review Comment:
   Can we merge these if statements into a single one?



##########
c++/src/Reader.cc:
##########
@@ -1205,8 +1215,32 @@ namespace orc {
   }
 
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {

Review Comment:
   nit: use `RowReaderImpl::getSelectedType()` to get selectedSchema so we can make sure it always exists.



##########
c++/src/Reader.cc:
##########
@@ -516,6 +518,13 @@ namespace orc {
     return forcedScaleOnHive11Decimal;
   }
 
+  void RowReaderImpl::getReadColumns(const Type* readType, std::set<uint64_t>& readColumns) const {

Review Comment:
   nit: `getReadColumns` can be a non-member function.



##########
c++/src/Reader.cc:
##########
@@ -1205,8 +1215,32 @@ namespace orc {
   }
 
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();
+      auto readType = schemaEvolution.getReadType();
+      std::set<uint64_t> readColumns;
+      getReadColumns(readType, readColumns);

Review Comment:
   What about this?
   ```cpp
   getReadColumns(readType, readColumnIds);
   getReadColumns(getSelectedType(), selectedColumnIds);
   if (readColumnIds != selectedColumnIds) {
     ...
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac merged pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac merged PR #1454:
URL: https://github.com/apache/orc/pull/1454


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] dongjoon-hyun commented on pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "dongjoon-hyun (via GitHub)" <gi...@apache.org>.

dongjoon-hyun commented on PR #1454:
URL: https://github.com/apache/orc/pull/1454#issuecomment-1602303179

   Could you double check this compilation issue, @ffacs , @wgtmac , @coderex2522 ?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] coderex2522 commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "coderex2522 (via GitHub)" <gi...@apache.org>.

coderex2522 commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1162579343


##########
c++/src/SchemaEvolution.cc:
##########
@@ -16,6 +16,8 @@
  * limitations under the License.
  */
 
+#include <iostream>

Review Comment:
   This patch does not seem to depend on the `<iostream>` header file. Is this include needed?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1162812751


##########
c++/src/Reader.hh:
##########
@@ -237,6 +241,10 @@ namespace orc {
     bool getThrowOnHive11DecimalOverflow() const;
     bool getIsDecimalAsLong() const;
     int32_t getForcedScaleOnHive11Decimal() const;
+
+    const SchemaEvolution& getSchemaEvolution() const {

Review Comment:
   Done



##########
c++/src/SchemaEvolution.cc:
##########
@@ -16,6 +16,8 @@
  * limitations under the License.
  */
 
+#include <iostream>

Review Comment:
   Removed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1156109311


##########
c++/include/orc/Vector.hh:
##########
@@ -53,6 +53,10 @@ namespace orc {
     // whether the vector batch is encoded
     bool isEncoded;
 
+    // whether the vector is not a numeric vector
+    // or it's created with the option useTightNumericVector
+    bool isTight = true;

Review Comment:
   > OK, that sounds good.
   > 
   > However, `isTight` here is a little bit confusing since it applies only to numeric types. We need to be careful about adding any field to the public API. It should also be used together with its type.
   > 
   > In the `ColumnReader`, we always check the result of dynamic_cast of the vector batch. Is it enough if we can use dynamic_cast and check if it is not null in the `ConvertColumnReader`?
   
   I think that's OK, so we will only verify the `ReaderOption` and check the result of dynamic_cast in `ConvertColumnReader`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1454: ORC-1385: [C++] Support schema evolution of numeric types

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1454:
URL: https://github.com/apache/orc/pull/1454#discussion_r1160366288


##########
c++/src/Reader.cc:
##########
@@ -1204,9 +1207,33 @@ namespace orc {
     return rowsInCurrentStripe;
   }
 
+  static void getColumnIds(const Type* type, std::set<uint64_t>& columnIds) {
+    columnIds.insert(type->getColumnId());
+    for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) {
+      getColumnIds(type->getSubtype(i), columnIds);
+    }
+  }
+
   std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch(uint64_t capacity) const {
-    return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock,
-                                            useTightNumericVector);
+    // If the read type is specified, then check that the selected schema matches the read type
+    // on the first call to createRowBatch.
+    if (schemaEvolution.getReadType() && selectedSchema.get() == nullptr) {
+      auto fileSchema = &getSelectedType();

Review Comment:
   OK, that sounds good. Sorry for not reading carefully.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org