You are viewing a plain text version of this content. The canonical link for it is here.

Posted to issues@orc.apache.org by "ffacs (via GitHub)" <gi...@apache.org> on 2023/05/16 18:08:29 UTC

[GitHub] [orc] ffacs opened a new pull request, #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

ffacs opened a new pull request, #1500:
URL: https://github.com/apache/orc/pull/1500

   ### What changes were proposed in this pull request?
   Support conversion from numeric to string group/decimal/timestamp.
   
   ### Why are the changes needed?
   To support schema evolution in C++.
   
   ### How was this patch tested?
   Unit tests (UT) passed.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1203418613


##########
c++/src/Timezone.cc:
##########
@@ -587,6 +587,12 @@ namespace orc {
       return clk + getVariant(clk).gmtOffset;
     }
 
+    int64_t convertFromUTC(int64_t clk) const override {

Review Comment:
   Could you add a test for this?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {

Review Comment:
   ```suggestion
             if constexpr (isFloatingFileType) {
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -235,10 +514,64 @@ namespace orc {
   DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
   DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
 
+  // Numeric to String/Char
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
+  using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
+
+  // Numeric to Decimal
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
+
+  // Numeric to Timestamp
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
+
 #define CASE_CREATE_READER(TYPE, CONVERT) \
   case TYPE:                              \
     return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
 
+  const static int32_t MAX_PRECISION_64 = 18;
+
+#define CASE_CREATE_DECIMAL_READER(FROM)                                                   \
+  case DECIMAL: {                                                                          \
+    if (_readType.getPrecision() <= MAX_PRECISION_64) {                                    \

Review Comment:
   ```suggestion
       if (_readType.getPrecision() > 0 && _readType.getPrecision() <= MAX_PRECISION_64) {                                    \
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));

Review Comment:
   I am not sure if it should throw if truncation happens.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";

Review Comment:
   Does it follow the Java implementation?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,

Review Comment:
   ditto



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      std::string strValue = std::to_string(value);
+      int32_t fromScale = 0;
+      int32_t fromPrecision = static_cast<int32_t>(strValue.length());

Review Comment:
   What if value is a negative number?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,

Review Comment:
   `_readType` and `_throwOnOverflow` do not need to have `_` prefix, right?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      std::string strValue = std::to_string(value);

Review Comment:
   String conversion would be slow. Can we simply convert it into an integer and reuse the integer code path?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>

Review Comment:
   ```suggestion
       template <typename SrcType>
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);

Review Comment:
   Should we throw in this case? Truncating the digits of a number yields a string that no longer represents the original value, so the result does not make sense at all.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      std::string strValue = std::to_string(value);
+      int32_t fromScale = 0;
+      int32_t fromPrecision = static_cast<int32_t>(strValue.length());
+      Int128 i128 = 0;
+      for (size_t i = 0; i < strValue.length(); ++i) {
+        auto c = strValue[i];
+        if (c == '.') {
+          fromScale = static_cast<int32_t>(strValue.length() - i - 1);
+          fromPrecision -= 1;
+          continue;
+        }
+        i128 *= 10;
+        i128 += c - '0';
+      }
+      auto result = convertDecimal(i128, fromPrecision, fromScale, precision, scale);
+      if (result.first) {
+        handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+      } else {
+        if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+          if (!result.second.fitsInLong()) {
+            handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+          } else {
+            dstBatch.values[idx] = result.second.toLong();
+          }
+        } else {
+          dstBatch.values[idx] = result.second;
+        }
+      }
+    }
+
+    template <typename srcType>

Review Comment:
   ```suggestion
       template <typename SrcType>
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>

Review Comment:
   Please add one blank line above this template declaration.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      std::string strValue = std::to_string(value);
+      int32_t fromScale = 0;
+      int32_t fromPrecision = static_cast<int32_t>(strValue.length());
+      Int128 i128 = 0;
+      for (size_t i = 0; i < strValue.length(); ++i) {
+        auto c = strValue[i];
+        if (c == '.') {
+          fromScale = static_cast<int32_t>(strValue.length() - i - 1);
+          fromPrecision -= 1;
+          continue;
+        }
+        i128 *= 10;
+        i128 += c - '0';
+      }
+      auto result = convertDecimal(i128, fromPrecision, fromScale, precision, scale);
+      if (result.first) {
+        handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+      } else {
+        if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+          if (!result.second.fitsInLong()) {
+            handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+          } else {
+            dstBatch.values[idx] = result.second.toLong();
+          }
+        } else {
+          dstBatch.values[idx] = result.second;
+        }
+      }
+    }
+
+    template <typename srcType>
+    void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      int fromScale = 0;
+      int fromPrecision = 1;
+      for (srcType tmp = value; tmp /= 10; ++fromPrecision)
+        ;
+      auto result = convertDecimal(value, fromPrecision, fromScale, precision, scale);
+      if (result.first) {
+        handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+      } else {
+        if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+          if (!result.second.fitsInLong()) {
+            handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+          } else {
+            dstBatch.values[idx] = result.second.toLong();
+          }
+        } else {
+          dstBatch.values[idx] = result.second;
+        }
+      }
+    }
+
+    int32_t precision;
+    int32_t scale;
+  };
+
+  class ConvertToTimestampColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                   StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+          readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? getTimezoneByName("GMT")
+                                                                 : stripe.getReaderTimezone()),
+          needConvertTimezone(&readerTimezone != &getTimezoneByName("GMT")) {}
+
+   protected:
+    const orc::Timezone& readerTimezone;
+    const bool needConvertTimezone;
+  };
+
+  template <typename FileTypeBatch>
+  class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
+   public:
+    NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                   StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          convertToTimestamp(dstBatch, i, srcBatch.data[i]);
+        }
+      }
+    }
+
+   private:
+    template <typename FileType>
+    void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value);
+  };
+
+  template <typename FileTypeBatch>
+  template <typename FileType>
+  void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
+      TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
+    if constexpr (std::is_floating_point<FileType>::value) {
+      if (value > static_cast<FileType>(std::numeric_limits<int64_t>::max()) ||
+          value < static_cast<FileType>(std::numeric_limits<int64_t>::min())) {
+        handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
+        return;
+      }
+      dstBatch.data[idx] = static_cast<int64_t>(value);
+      dstBatch.nanoseconds[idx] = static_cast<int32_t>(
+          static_cast<double>(value - static_cast<FileType>(dstBatch.data[idx])) * 1e9);
+      if (dstBatch.nanoseconds[idx] < 0) {
+        dstBatch.data[idx] -= 1;
+        dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
+      }
+    } else {
+      dstBatch.data[idx] = value;
+      dstBatch.nanoseconds[idx] = 0;
+    }
+    if (needConvertTimezone) {
+      dstBatch.data[idx] = readerTimezone.convertFromUTC(dstBatch.data[idx]);

Review Comment:
   Does this timezone conversion (`convertFromUTC`) follow the Java implementation as well?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);

Review Comment:
   We probably need to throw or nullify if the result needs to be truncated. Otherwise the converted string does not keep the original value, which is not meaningful.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1221855888


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFLoatingFileType>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+      scaleMultiplier = 1;
+      bool overflow = false;
+      upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);
+      for (int i = 0; i < scale; i++) {
+        scaleMultiplier *= 10;
+      }
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFLoatingFileType) {

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac closed pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac closed pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp
URL: https://github.com/apache/orc/pull/1500


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1221857835


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();

Review Comment:
   I made it uint64_t, so it should not overflow.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1209413055


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,

Review Comment:
   > `_readType` and `_throwOnOverflow` do not need to have `_` prefix, right?
   
   error: parameter 'readType' shadows member inherited from type 'ConvertColumnReader' [-Werror,-Wshadow-field]



##########
c++/src/Timezone.cc:
##########
@@ -587,6 +587,12 @@ namespace orc {
       return clk + getVariant(clk).gmtOffset;
     }
 
+    int64_t convertFromUTC(int64_t clk) const override {

Review Comment:
   > Could you add a test for this?
   
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1209453953


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";

Review Comment:
   > Does it follow the Java implementation?
   
   https://github.com/apache/orc/blob/ec2ea9c6aff8b8515452df651f08695639c18cbb/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java#L1118-L1133
   https://github.com/apache/orc/blob/ec2ea9c6aff8b8515452df651f08695639c18cbb/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java#L123-L146
   I think so, but I am not good at Java; please take a look.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1209460653


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -235,10 +514,64 @@ namespace orc {
   DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
   DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
 
+  // Numeric to String/Char
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
+  using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
+
+  // Numeric to Decimal
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
+
+  // Numeric to Timestamp
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
+
 #define CASE_CREATE_READER(TYPE, CONVERT) \
   case TYPE:                              \
     return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
 
+  const static int32_t MAX_PRECISION_64 = 18;
+
+#define CASE_CREATE_DECIMAL_READER(FROM)                                                   \
+  case DECIMAL: {                                                                          \
+    if (_readType.getPrecision() <= MAX_PRECISION_64) {                                    \

Review Comment:
   getPrecision() returns a uint64_t value.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] dongjoon-hyun commented on pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "dongjoon-hyun (via GitHub)" <gi...@apache.org>.

dongjoon-hyun commented on PR #1500:
URL: https://github.com/apache/orc/pull/1500#issuecomment-1602323041

   Let me bring this to branch-1.9 too~


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1227022596


##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+    int8_t roundOffset = 0;
+    int32_t deltaScale = fromScale - toScale;
+
+    if (deltaScale > 0) {
+      Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+      result.second = result.second.divide(scale, remainder);
+      remainder *= 2;
+      if (round && remainder >= scale) {
+        upperBound -= 1;
+        roundOffset = 1;
+      }
+    } else if (deltaScale < 0) {
+      if (result.second > upperBound) {
+        result.first = true;
+        return result;
+      }
+      result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+    }
+
+    if (result.second > upperBound) {
+      result.first = true;
+      return result;
+    }
+
+    result.second += roundOffset;
+    if (negative) {
+      result.second *= -1;
+    }
+    return result;
+  }
+
+  template <typename T>
+  std::pair<bool, Int128> convertDecimal(T value, int32_t precision, int32_t scale) {

Review Comment:
   > Should we check if value is nan?
   
   done



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+    int8_t roundOffset = 0;
+    int32_t deltaScale = fromScale - toScale;
+
+    if (deltaScale > 0) {
+      Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+      result.second = result.second.divide(scale, remainder);
+      remainder *= 2;
+      if (round && remainder >= scale) {
+        upperBound -= 1;
+        roundOffset = 1;
+      }
+    } else if (deltaScale < 0) {
+      if (result.second > upperBound) {
+        result.first = true;
+        return result;
+      }
+      result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+    }
+
+    if (result.second > upperBound) {
+      result.first = true;
+      return result;
+    }
+
+    result.second += roundOffset;
+    if (negative) {
+      result.second *= -1;
+    }
+    return result;
+  }
+
+  template <typename T>
+  std::pair<bool, Int128> convertDecimal(T value, int32_t precision, int32_t scale) {
+    std::pair<bool, Int128> result = {false, 0};
+    if (value <= -std::ldexp(static_cast<T>(1), 127) ||

Review Comment:
   > Make std::ldexp(static_cast<T>(1), 127) const static as it is repeatedly used.
   
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1222450883


##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+    int8_t roundOffset = 0;
+    int32_t deltaScale = fromScale - toScale;
+
+    if (deltaScale > 0) {
+      Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+      result.second = result.second.divide(scale, remainder);
+      remainder *= 2;
+      if (round && remainder >= scale) {
+        upperBound -= 1;
+        roundOffset = 1;
+      }
+    } else if (deltaScale < 0) {
+      if (result.second > upperBound) {
+        result.first = true;
+        return result;
+      }
+      result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+    }
+
+    if (result.second > upperBound) {
+      result.first = true;
+      return result;
+    }
+
+    result.second += roundOffset;
+    if (negative) {
+      result.second *= -1;
+    }
+    return result;
+  }
+
+  template <typename T>

Review Comment:
   Should we use std::enable_if to restrict T to be floating type only?



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+    int8_t roundOffset = 0;
+    int32_t deltaScale = fromScale - toScale;
+
+    if (deltaScale > 0) {
+      Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+      result.second = result.second.divide(scale, remainder);
+      remainder *= 2;
+      if (round && remainder >= scale) {
+        upperBound -= 1;
+        roundOffset = 1;
+      }
+    } else if (deltaScale < 0) {
+      if (result.second > upperBound) {
+        result.first = true;
+        return result;
+      }
+      result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+    }
+
+    if (result.second > upperBound) {
+      result.first = true;
+      return result;
+    }
+
+    result.second += roundOffset;
+    if (negative) {
+      result.second *= -1;
+    }
+    return result;
+  }
+
+  template <typename T>
+  std::pair<bool, Int128> convertDecimal(T value, int32_t precision, int32_t scale) {

Review Comment:
   Should we check if value is nan?



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+    int8_t roundOffset = 0;
+    int32_t deltaScale = fromScale - toScale;
+
+    if (deltaScale > 0) {
+      Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+      result.second = result.second.divide(scale, remainder);
+      remainder *= 2;
+      if (round && remainder >= scale) {
+        upperBound -= 1;
+        roundOffset = 1;
+      }
+    } else if (deltaScale < 0) {
+      if (result.second > upperBound) {
+        result.first = true;
+        return result;
+      }
+      result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+    }
+
+    if (result.second > upperBound) {
+      result.first = true;
+      return result;
+    }
+
+    result.second += roundOffset;
+    if (negative) {
+      result.second *= -1;
+    }
+    return result;
+  }
+
+  template <typename T>
+  std::pair<bool, Int128> convertDecimal(T value, int32_t precision, int32_t scale) {
+    std::pair<bool, Int128> result = {false, 0};
+    if (value <= -std::ldexp(static_cast<T>(1), 127) ||

Review Comment:
   Make std::ldexp(static_cast<T>(1), 127) const static as it is repeatedly used.



##########
c++/test/TestInt128.cc:
##########
@@ -643,4 +643,242 @@ namespace orc {
     EXPECT_TRUE(Int128(-12340000).toDecimalString(8, true) == "-0.1234");
   }
 
+  TEST(Int128, testConvertDecimal) {
+    // Test convert decimal to different precision/scale
+    Int128 num = Int128(1234567890);
+
+    int fromScale = 5;
+    int toPrecision = 9;
+    int toScale = 5;
+    auto pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true) << pair.second.toString();  // overflow
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 4;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123456789));
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 2;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234568)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 7;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 8;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+  }
+
+  TEST(Int128, testConvertDecimaFromFloat) {
+    double fromDouble = 12345.13579;
+    int toPrecision = 4;
+    int toScale = 2;
+    auto pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 5;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 5;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 6;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123451)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 2;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 8;
+    toScale = 3;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345136)) << pair.second.toString();
+
+    fromDouble = -12345.13579;

Review Comment:
   Probably +0.0, -0.0, and NaN should be added



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);

Review Comment:
   Or assert that fromScale, toPrecision, and toScale are in their valid ranges.



##########
c++/test/TestInt128.cc:
##########
@@ -643,4 +643,242 @@ namespace orc {
     EXPECT_TRUE(Int128(-12340000).toDecimalString(8, true) == "-0.1234");
   }
 
+  TEST(Int128, testConvertDecimal) {
+    // Test convert decimal to different precision/scale
+    Int128 num = Int128(1234567890);
+
+    int fromScale = 5;
+    int toPrecision = 9;
+    int toScale = 5;
+    auto pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true) << pair.second.toString();  // overflow
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 4;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123456789));
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 2;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234568)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 7;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 8;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+  }
+
+  TEST(Int128, testConvertDecimaFromFloat) {
+    double fromDouble = 12345.13579;
+    int toPrecision = 4;
+    int toScale = 2;
+    auto pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 5;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 5;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 6;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123451)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 2;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 8;
+    toScale = 3;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345136)) << pair.second.toString();
+
+    fromDouble = -12345.13579;

Review Comment:
   Could you add more numbers to test? Would be good to test float type as well.



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);

Review Comment:
   nit: check result.first in case of invalid toPrecision



##########
c++/test/TestInt128.cc:
##########
@@ -643,4 +643,242 @@ namespace orc {
     EXPECT_TRUE(Int128(-12340000).toDecimalString(8, true) == "-0.1234");
   }
 
+  TEST(Int128, testConvertDecimal) {
+    // Test convert decimal to different precision/scale
+    Int128 num = Int128(1234567890);
+
+    int fromScale = 5;
+    int toPrecision = 9;
+    int toScale = 5;
+    auto pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true) << pair.second.toString();  // overflow
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 4;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123456789));
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 2;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234568)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 7;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 8;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+  }
+
+  TEST(Int128, testConvertDecimaFromFloat) {
+    double fromDouble = 12345.13579;
+    int toPrecision = 4;
+    int toScale = 2;
+    auto pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 5;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 5;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 6;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123451)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 2;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 8;
+    toScale = 3;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345136)) << pair.second.toString();
+
+    fromDouble = -12345.13579;

Review Comment:
   Add large numbers that are beyond 18 digits.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1227023047


##########
c++/test/TestInt128.cc:
##########
@@ -643,4 +643,242 @@ namespace orc {
     EXPECT_TRUE(Int128(-12340000).toDecimalString(8, true) == "-0.1234");
   }
 
+  TEST(Int128, testConvertDecimal) {
+    // Test convert decimal to different precision/scale
+    Int128 num = Int128(1234567890);
+
+    int fromScale = 5;
+    int toPrecision = 9;
+    int toScale = 5;
+    auto pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true) << pair.second.toString();  // overflow
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 4;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123456789));
+
+    fromScale = 5;
+    toPrecision = 9;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 2;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234568)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 10;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 0;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12346));
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 3;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345679)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 11;
+    toScale = 7;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 5;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(1234567890)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 6;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345678900)) << pair.second.toString();
+
+    fromScale = 5;
+    toPrecision = 12;
+    toScale = 8;
+    pair = convertDecimal(num, fromScale, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+  }
+
+  TEST(Int128, testConvertDecimaFromFloat) {
+    double fromDouble = 12345.13579;
+    int toPrecision = 4;
+    int toScale = 2;
+    auto pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 5;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 5;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 6;
+    toScale = 0;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 1;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(123451)) << pair.second.toString();
+
+    toPrecision = 6;
+    toScale = 2;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    toPrecision = 8;
+    toScale = 3;
+    pair = convertDecimal(fromDouble, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second, Int128(12345136)) << pair.second.toString();
+
+    fromDouble = -12345.13579;

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1221858130


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1227022269


##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);

Review Comment:
   > Or assert fromScale, toPrecision, toScale are in the valid ranges.
   
   done



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,84 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                         int32_t toScale, bool round) {
+    std::pair<bool, Int128> result;
+    bool negative = value < 0;
+    result.second = value.abs();
+    result.first = false;
+
+    Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+    int8_t roundOffset = 0;
+    int32_t deltaScale = fromScale - toScale;
+
+    if (deltaScale > 0) {
+      Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+      result.second = result.second.divide(scale, remainder);
+      remainder *= 2;
+      if (round && remainder >= scale) {
+        upperBound -= 1;
+        roundOffset = 1;
+      }
+    } else if (deltaScale < 0) {
+      if (result.second > upperBound) {
+        result.first = true;
+        return result;
+      }
+      result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+    }
+
+    if (result.second > upperBound) {
+      result.first = true;
+      return result;
+    }
+
+    result.second += roundOffset;
+    if (negative) {
+      result.second *= -1;
+    }
+    return result;
+  }
+
+  template <typename T>

Review Comment:
   > Should we use std::enable_if to restrict T to be floating type only?
   
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1216879773


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // contact string values to blob buffer of vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      std::string strValue = std::to_string(value);

Review Comment:
   > String conversion would be slow. Can we simply convert it into an integer and reuse the integer code path?
   
   The integer portion of a double may be larger than std::numeric_limits<uint64_t>::max(), so I convert it to Int128 first.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1216875218


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -235,10 +514,64 @@ namespace orc {
   DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
   DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
 
+  // Numeric to String/Char
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
+  using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
+
+  // Numeric to Decimal
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
+
+  // Numeric to Timestamp
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
+
 #define CASE_CREATE_READER(TYPE, CONVERT) \
   case TYPE:                              \
     return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
 
+  const static int32_t MAX_PRECISION_64 = 18;
+
+#define CASE_CREATE_DECIMAL_READER(FROM)                                                   \
+  case DECIMAL: {                                                                          \
+    if (_readType.getPrecision() <= MAX_PRECISION_64) {                                    \

Review Comment:
   > Yes, but precision == 0 indicates it should use int128.
   
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1221864975


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;

Review Comment:
   They can't be const static, because for the 'CHAR' type the constructor modifies them (it pads them with spaces to the maximum length).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on PR #1500:
URL: https://github.com/apache/orc/pull/1500#issuecomment-1588671135

   Thanks @ffacs! Overall looks good.
   
   Could you please make all the CIs pass?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1213823442


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -235,10 +514,64 @@ namespace orc {
   DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
   DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
 
+  // Numeric to String/Char
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
+  DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
+  using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
+  using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
+
+  // Numeric to Decimal
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
+  DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
+
+  // Numeric to Timestamp
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
+  DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
+
 #define CASE_CREATE_READER(TYPE, CONVERT) \
   case TYPE:                              \
     return std::make_unique<CONVERT##ColumnReader>(_readType, fileType, stripe, throwOnOverflow);
 
+  const static int32_t MAX_PRECISION_64 = 18;
+
+#define CASE_CREATE_DECIMAL_READER(FROM)                                                   \
+  case DECIMAL: {                                                                          \
+    if (_readType.getPrecision() <= MAX_PRECISION_64) {                                    \

Review Comment:
   Yes, but precision == 0 indicates it should use int128.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1218823977


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;

Review Comment:
   trueValue and falseValue can actually be const static variables.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {

Review Comment:
   ```suggestion
         if (readType.getKind() == CHAR || readType.getKind() == VARCHAR) {
   ```



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {

Review Comment:
   ```suggestion
       } else if (readType.getKind() == CHAR) {
   ```
   And explicitly throw in the else branch



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,42 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale,

Review Comment:
   Please add some tests for this new function.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();

Review Comment:
   Though overflow is unlikely, it would be good to add an overflow check here and on all of the additions below.



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFLoatingFileType>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+      scaleMultiplier = 1;
+      bool overflow = false;
+      upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);
+      for (int i = 0; i < scale; i++) {
+        scaleMultiplier *= 10;
+      }
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFLoatingFileType) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename SrcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {

Review Comment:
   Can we extract its core logic into Int128.hh and make sure it is covered by some tests?



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFLoatingFileType>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+      scaleMultiplier = 1;
+      bool overflow = false;
+      upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);
+      for (int i = 0; i < scale; i++) {
+        scaleMultiplier *= 10;
+      }
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFLoatingFileType) {

Review Comment:
   ```suggestion
             if constexpr (isFloatingFileType) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1221855494


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // contact string values to blob buffer of vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFLoatingFileType>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+      scaleMultiplier = 1;
+      bool overflow = false;
+      upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);
+      for (int i = 0; i < scale; i++) {
+        scaleMultiplier *= 10;
+      }
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFLoatingFileType) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename SrcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {

Review Comment:
   > Can we extract its core logic into Int128.hh and make sure it is covered by some tests?
   
   done



##########
c++/src/Int128.cc:
##########
@@ -488,4 +488,42 @@ namespace orc {
     return value;
   }
 
+  std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale,

Review Comment:
   > Please add some tests for this new function.
   
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] wgtmac commented on pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "wgtmac (via GitHub)" <gi...@apache.org>.

wgtmac commented on PR #1500:
URL: https://github.com/apache/orc/pull/1500#issuecomment-1602897093

   Thanks @dongjoon-hyun!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] dongjoon-hyun commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "dongjoon-hyun (via GitHub)" <gi...@apache.org>.

dongjoon-hyun commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1238261627


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -257,169 +603,169 @@ namespace orc {
     switch (fileType.getKind()) {
       case BOOLEAN: {
         switch (_readType.getKind()) {
-          CASE_CREATE_READER(BYTE, BooleanToByte);
-          CASE_CREATE_READER(SHORT, BooleanToShort);
-          CASE_CREATE_READER(INT, BooleanToInt);
-          CASE_CREATE_READER(LONG, BooleanToLong);
-          CASE_CREATE_READER(FLOAT, BooleanToFloat);
-          CASE_CREATE_READER(DOUBLE, BooleanToDouble);
+          CASE_CREATE_READER(BYTE, BooleanToByte)
+          CASE_CREATE_READER(SHORT, BooleanToShort)
+          CASE_CREATE_READER(INT, BooleanToInt)
+          CASE_CREATE_READER(LONG, BooleanToLong)
+          CASE_CREATE_READER(FLOAT, BooleanToFloat)
+          CASE_CREATE_READER(DOUBLE, BooleanToDouble)
+          CASE_CREATE_READER(STRING, BooleanToString)
+          CASE_CREATE_READER(CHAR, BooleanToChar)
+          CASE_CREATE_READER(VARCHAR, BooleanToVarchar)
+          CASE_CREATE_DECIMAL_READER(Boolean)
+          CASE_CREATE_READER(TIMESTAMP, BooleanToTimestamp)
+          CASE_CREATE_READER(TIMESTAMP_INSTANT, BooleanToTimestamp)
           case BOOLEAN:
-          case STRING:
           case BINARY:
-          case TIMESTAMP:
           case LIST:
           case MAP:
           case STRUCT:
           case UNION:
-          case DECIMAL:
           case DATE:
-          case VARCHAR:
-          case CHAR:
-          case TIMESTAMP_INSTANT:
             CASE_EXCEPTION
         }
       }
       case BYTE: {
         switch (_readType.getKind()) {
-          CASE_CREATE_READER(BOOLEAN, ByteToBoolean);
-          CASE_CREATE_READER(SHORT, ByteToShort);
-          CASE_CREATE_READER(INT, ByteToInt);
-          CASE_CREATE_READER(LONG, ByteToLong);
-          CASE_CREATE_READER(FLOAT, ByteToFloat);
-          CASE_CREATE_READER(DOUBLE, ByteToDouble);
+          CASE_CREATE_READER(BOOLEAN, ByteToBoolean)
+          CASE_CREATE_READER(SHORT, ByteToShort)
+          CASE_CREATE_READER(INT, ByteToInt)
+          CASE_CREATE_READER(LONG, ByteToLong)
+          CASE_CREATE_READER(FLOAT, ByteToFloat)
+          CASE_CREATE_READER(DOUBLE, ByteToDouble)
+          CASE_CREATE_READER(STRING, ByteToString)
+          CASE_CREATE_READER(CHAR, ByteToChar)
+          CASE_CREATE_READER(VARCHAR, ByteToVarchar)
+          CASE_CREATE_DECIMAL_READER(Byte)
+          CASE_CREATE_READER(TIMESTAMP, ByteToTimestamp)
+          CASE_CREATE_READER(TIMESTAMP_INSTANT, ByteToTimestamp)
           case BYTE:
-          case STRING:
           case BINARY:
-          case TIMESTAMP:
           case LIST:
           case MAP:
           case STRUCT:
           case UNION:
-          case DECIMAL:
           case DATE:
-          case VARCHAR:
-          case CHAR:
-          case TIMESTAMP_INSTANT:
             CASE_EXCEPTION
         }
       }
       case SHORT: {
         switch (_readType.getKind()) {
-          CASE_CREATE_READER(BOOLEAN, ShortToBoolean);
-          CASE_CREATE_READER(BYTE, ShortToByte);
-          CASE_CREATE_READER(INT, ShortToInt);
-          CASE_CREATE_READER(LONG, ShortToLong);
-          CASE_CREATE_READER(FLOAT, ShortToFloat);
-          CASE_CREATE_READER(DOUBLE, ShortToDouble);
+          CASE_CREATE_READER(BOOLEAN, ShortToBoolean)
+          CASE_CREATE_READER(BYTE, ShortToByte)
+          CASE_CREATE_READER(INT, ShortToInt)
+          CASE_CREATE_READER(LONG, ShortToLong)
+          CASE_CREATE_READER(FLOAT, ShortToFloat)
+          CASE_CREATE_READER(DOUBLE, ShortToDouble)
+          CASE_CREATE_READER(STRING, ShortToString)
+          CASE_CREATE_READER(CHAR, ShortToChar)
+          CASE_CREATE_READER(VARCHAR, ShortToVarchar)
+          CASE_CREATE_DECIMAL_READER(Short)
+          CASE_CREATE_READER(TIMESTAMP, ShortToTimestamp)
+          CASE_CREATE_READER(TIMESTAMP_INSTANT, ShortToTimestamp)
           case SHORT:
-          case STRING:
           case BINARY:
-          case TIMESTAMP:
           case LIST:
           case MAP:
           case STRUCT:
           case UNION:
-          case DECIMAL:
           case DATE:
-          case VARCHAR:
-          case CHAR:
-          case TIMESTAMP_INSTANT:
             CASE_EXCEPTION
         }
       }
       case INT: {
         switch (_readType.getKind()) {
-          CASE_CREATE_READER(BOOLEAN, IntToBoolean);
-          CASE_CREATE_READER(BYTE, IntToByte);
-          CASE_CREATE_READER(SHORT, IntToShort);
-          CASE_CREATE_READER(LONG, IntToLong);
-          CASE_CREATE_READER(FLOAT, IntToFloat);
-          CASE_CREATE_READER(DOUBLE, IntToDouble);
+          CASE_CREATE_READER(BOOLEAN, IntToBoolean)
+          CASE_CREATE_READER(BYTE, IntToByte)
+          CASE_CREATE_READER(SHORT, IntToShort)
+          CASE_CREATE_READER(LONG, IntToLong)
+          CASE_CREATE_READER(FLOAT, IntToFloat)
+          CASE_CREATE_READER(DOUBLE, IntToDouble)
+          CASE_CREATE_READER(STRING, IntToString)
+          CASE_CREATE_READER(CHAR, IntToChar)
+          CASE_CREATE_READER(VARCHAR, IntToVarchar)
+          CASE_CREATE_DECIMAL_READER(Int)
+          CASE_CREATE_READER(TIMESTAMP, IntToTimestamp)
+          CASE_CREATE_READER(TIMESTAMP_INSTANT, IntToTimestamp)
           case INT:
-          case STRING:
           case BINARY:
-          case TIMESTAMP:
           case LIST:
           case MAP:
           case STRUCT:
           case UNION:
-          case DECIMAL:
           case DATE:
-          case VARCHAR:
-          case CHAR:
-          case TIMESTAMP_INSTANT:
             CASE_EXCEPTION
         }
       }
       case LONG: {
         switch (_readType.getKind()) {
-          CASE_CREATE_READER(BOOLEAN, LongToBoolean);

Review Comment:
   We need this semicolon removal in `branch-1.9` too.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Posted by "ffacs (via GitHub)" <gi...@apache.org>.

ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1209461269


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // cache converted string in the buffer
+      auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+      // concatenate string values into the blob buffer of the vector batch
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(totalLength);
+      char* blob = dstBatch.blob.data();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          const auto size = strBuffer[i].size();
+          ::memcpy(blob, strBuffer[i].c_str(), size);
+          dstBatch.data[i] = blob;
+          dstBatch.length[i] = static_cast<int32_t>(size);
+          blob += size;
+        }
+      }
+      strBuffer.clear();
+    }
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+    std::string trueValue = "TRUE";
+    std::string falseValue = "FALSE";
+    if (readType.getKind() == CHAR) {
+      trueValue.resize(readType.getMaximumLength(), ' ');
+      falseValue.resize(readType.getMaximumLength(), ' ');
+    } else if (readType.getKind() == VARCHAR) {
+      trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4), readType.getMaximumLength()));
+      falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5), readType.getMaximumLength()));
+    }
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            strBuffer[i].resize(maxLength);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+          }
+          size += strBuffer[i].size();
+        }
+      }
+    }
+    return size;
+  }
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      std::string strValue = std::to_string(value);
+      int32_t fromScale = 0;
+      int32_t fromPrecision = static_cast<int32_t>(strValue.length());
+      Int128 i128 = 0;
+      for (size_t i = 0; i < strValue.length(); ++i) {
+        auto c = strValue[i];
+        if (c == '.') {
+          fromScale = static_cast<int32_t>(strValue.length() - i - 1);
+          fromPrecision -= 1;
+          continue;
+        }
+        i128 *= 10;
+        i128 += c - '0';
+      }
+      auto result = convertDecimal(i128, fromPrecision, fromScale, precision, scale);
+      if (result.first) {
+        handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+      } else {
+        if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+          if (!result.second.fitsInLong()) {
+            handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+          } else {
+            dstBatch.values[idx] = result.second.toLong();
+          }
+        } else {
+          dstBatch.values[idx] = result.second;
+        }
+      }
+    }
+
+    template <typename srcType>
+    void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      int fromScale = 0;
+      int fromPrecision = 1;
+      for (srcType tmp = value; tmp /= 10; ++fromPrecision)
+        ;
+      auto result = convertDecimal(value, fromPrecision, fromScale, precision, scale);
+      if (result.first) {
+        handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+      } else {
+        if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+          if (!result.second.fitsInLong()) {
+            handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+          } else {
+            dstBatch.values[idx] = result.second.toLong();
+          }
+        } else {
+          dstBatch.values[idx] = result.second;
+        }
+      }
+    }
+
+    int32_t precision;
+    int32_t scale;
+  };
+
+  class ConvertToTimestampColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                   StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+          readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ? getTimezoneByName("GMT")
+                                                                 : stripe.getReaderTimezone()),
+          needConvertTimezone(&readerTimezone != &getTimezoneByName("GMT")) {}
+
+   protected:
+    const orc::Timezone& readerTimezone;
+    const bool needConvertTimezone;
+  };
+
+  template <typename FileTypeBatch>
+  class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
+   public:
+    NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                   StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          convertToTimestamp(dstBatch, i, srcBatch.data[i]);
+        }
+      }
+    }
+
+   private:
+    template <typename FileType>
+    void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value);
+  };
+
+  template <typename FileTypeBatch>
+  template <typename FileType>
+  void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
+      TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
+    if constexpr (std::is_floating_point<FileType>::value) {
+      if (value > static_cast<FileType>(std::numeric_limits<int64_t>::max()) ||
+          value < static_cast<FileType>(std::numeric_limits<int64_t>::min())) {
+        handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
+        return;
+      }
+      dstBatch.data[idx] = static_cast<int64_t>(value);
+      dstBatch.nanoseconds[idx] = static_cast<int32_t>(
+          static_cast<double>(value - static_cast<FileType>(dstBatch.data[idx])) * 1e9);
+      if (dstBatch.nanoseconds[idx] < 0) {
+        dstBatch.data[idx] -= 1;
+        dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
+      }
+    } else {
+      dstBatch.data[idx] = value;
+      dstBatch.nanoseconds[idx] = 0;
+    }
+    if (needConvertTimezone) {
+      dstBatch.data[idx] = readerTimezone.convertFromUTC(dstBatch.data[idx]);

Review Comment:
   > Does this follow the Java impl as well?
   
   https://github.com/apache/orc/blob/ec2ea9c6aff8b8515452df651f08695639c18cbb/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java#L1518-L1524
   
   Please take a look.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@orc.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org