You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by gi...@apache.org on 2024/01/26 01:26:42 UTC
(arrow-nanoarrow) branch main updated: Update dist/ for commit c3871347dc02e31465c37e3bc61f743460128c88
This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new fc1c081c Update dist/ for commit c3871347dc02e31465c37e3bc61f743460128c88
fc1c081c is described below
commit fc1c081c4a2100bd1e68c24a817888b27d284a79
Author: GitHub Actions <ac...@github.com>
AuthorDate: Fri Jan 26 01:26:38 2024 +0000
Update dist/ for commit c3871347dc02e31465c37e3bc61f743460128c88
---
dist/nanoarrow.c | 222 +++++++++++++++++++++++++++++++++++++++++++++
dist/nanoarrow.h | 33 +++++++
dist/nanoarrow_testing.hpp | 68 ++++++++++++++
3 files changed, 323 insertions(+)
diff --git a/dist/nanoarrow.c b/dist/nanoarrow.c
index dc7ea6a1..e2fff10a 100644
--- a/dist/nanoarrow.c
+++ b/dist/nanoarrow.c
@@ -231,6 +231,205 @@ struct ArrowBufferAllocator ArrowBufferDeallocator(
allocator.private_data = private_data;
return allocator;
}
+
+static const int kInt32DecimalDigits = 9;
+
+static const uint64_t kUInt32PowersOfTen[] = {
+ 1ULL, 10ULL, 100ULL, 1000ULL, 10000ULL,
+ 100000ULL, 1000000ULL, 10000000ULL, 100000000ULL, 1000000000ULL};
+
+// Adapted from Arrow C++ to use 32-bit words for better C portability
+// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L524-L544
+static void ShiftAndAdd(struct ArrowStringView value, uint32_t* out, int64_t out_size) {
+ // We use strtoll for parsing, which needs input that is null-terminated
+ char chunk_string[16];
+
+ for (int64_t posn = 0; posn < value.size_bytes;) {
+ int64_t remaining = value.size_bytes - posn;
+
+ int64_t group_size;
+ if (remaining > kInt32DecimalDigits) {
+ group_size = kInt32DecimalDigits;
+ } else {
+ group_size = remaining;
+ }
+
+ const uint64_t multiple = kUInt32PowersOfTen[group_size];
+
+ memcpy(chunk_string, value.data + posn, group_size);
+ chunk_string[group_size] = '\0';
+ uint32_t chunk = (uint32_t)strtoll(chunk_string, NULL, 10);
+
+ for (int64_t i = 0; i < out_size; i++) {
+ uint64_t tmp = out[i];
+ tmp *= multiple;
+ tmp += chunk;
+ out[i] = (uint32_t)(tmp & 0xFFFFFFFFULL);
+ chunk = (uint32_t)(tmp >> 32);
+ }
+ posn += group_size;
+ }
+}
+
+ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal,
+ struct ArrowStringView value) {
+ // Check for sign
+ int is_negative = value.data[0] == '-';
+ int has_sign = is_negative || value.data[0] == '+';
+ value.data += has_sign;
+ value.size_bytes -= has_sign;
+
+ // Check all characters are digits that are not the negative sign
+ for (int64_t i = 0; i < value.size_bytes; i++) {
+ char c = value.data[i];
+ if (c < '0' || c > '9') {
+ return EINVAL;
+ }
+ }
+
+ // Skip over leading 0s
+ int64_t n_leading_zeroes = 0;
+ for (int64_t i = 0; i < value.size_bytes; i++) {
+ if (value.data[i] == '0') {
+ n_leading_zeroes++;
+ } else {
+ break;
+ }
+ }
+
+ value.data += n_leading_zeroes;
+ value.size_bytes -= n_leading_zeroes;
+
+ // Use 32-bit words for portability
+ uint32_t words32[8];
+ int n_words32 = decimal->n_words * 2;
+ NANOARROW_DCHECK(n_words32 <= 8);
+ memset(words32, 0, sizeof(words32));
+
+ ShiftAndAdd(value, words32, n_words32);
+
+ if (decimal->low_word_index == 0) {
+ memcpy(decimal->words, words32, sizeof(uint32_t) * n_words32);
+ } else {
+ uint64_t lo;
+ uint64_t hi;
+
+ for (int i = 0; i < decimal->n_words; i++) {
+ lo = (uint64_t)words32[i * 2];
+ hi = (uint64_t)words32[i * 2 + 1] << 32;
+ decimal->words[decimal->n_words - i - 1] = lo | hi;
+ }
+ }
+
+ if (is_negative) {
+ ArrowDecimalNegate(decimal);
+ }
+
+ return NANOARROW_OK;
+}
+
+// Adapted from Arrow C++ for C
+// https://github.com/apache/arrow/blob/cd3321b28b0c9703e5d7105d6146c1270bbadd7f/cpp/src/arrow/util/decimal.cc#L365
+ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal,
+ struct ArrowBuffer* buffer) {
+ int is_negative = ArrowDecimalSign(decimal) < 0;
+
+ uint64_t words_little_endian[4];
+ if (decimal->low_word_index == 0) {
+ memcpy(words_little_endian, decimal->words, decimal->n_words * sizeof(uint64_t));
+ } else {
+ for (int i = 0; i < decimal->n_words; i++) {
+ words_little_endian[i] = decimal->words[decimal->n_words - i - 1];
+ }
+ }
+
+ // We've already made a copy, so negate that if needed
+ if (is_negative) {
+ uint64_t carry = 1;
+ for (int i = 0; i < decimal->n_words; i++) {
+ uint64_t elem = words_little_endian[i];
+ elem = ~elem + carry;
+ carry &= (elem == 0);
+ words_little_endian[i] = elem;
+ }
+ }
+
+ // Find the most significant word that is non-zero
+ int most_significant_elem_idx = -1;
+ for (int i = decimal->n_words - 1; i >= 0; i--) {
+ if (words_little_endian[i] != 0) {
+ most_significant_elem_idx = i;
+ break;
+ }
+ }
+
+ // If they are all zero, the output is just '0'
+ if (most_significant_elem_idx == -1) {
+ NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt8(buffer, '0'));
+ return NANOARROW_OK;
+ }
+
+ // Define segments such that each segment represents 9 digits with the
+ // least significant group of 9 digits first. For example, if the input represents
+ // 9876543210123456789, then segments will be [123456789, 876543210, 9].
+ // We handle at most a signed 256 bit integer, whose maximum value occupies 77
+ // characters. Thus, we need at most 9 segments.
+ const uint32_t k1e9 = 1000000000U;
+ int num_segments = 0;
+ uint32_t segments[9];
+ memset(segments, 0, sizeof(segments));
+ uint64_t* most_significant_elem = words_little_endian + most_significant_elem_idx;
+
+ do {
+ // Compute remainder = words_little_endian % 1e9 and words_little_endian =
+ // words_little_endian / 1e9.
+ uint32_t remainder = 0;
+ uint64_t* elem = most_significant_elem;
+
+ do {
+ // Compute dividend = (remainder << 32) | *elem (a virtual 96-bit integer);
+ // *elem = dividend / 1e9;
+ // remainder = dividend % 1e9.
+ uint32_t hi = (uint32_t)(*elem >> 32);
+ uint32_t lo = (uint32_t)(*elem & 0xFFFFFFFFULL);
+ uint64_t dividend_hi = ((uint64_t)(remainder) << 32) | hi;
+ uint64_t quotient_hi = dividend_hi / k1e9;
+ remainder = (uint32_t)(dividend_hi % k1e9);
+ uint64_t dividend_lo = ((uint64_t)(remainder) << 32) | lo;
+ uint64_t quotient_lo = dividend_lo / k1e9;
+ remainder = (uint32_t)(dividend_lo % k1e9);
+
+ *elem = (quotient_hi << 32) | quotient_lo;
+ } while (elem-- != words_little_endian);
+
+ segments[num_segments++] = remainder;
+ } while (*most_significant_elem != 0 || most_significant_elem-- != words_little_endian);
+
+ // We know our output has no more than 9 digits per segment, plus a negative sign,
+ // plus any further digits between our output of 9 digits plus enough
+ // extra characters to ensure that snprintf() with n = 21 (maximum length of %lu
+ // including a the null terminator) is bounded properly.
+ NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(buffer, num_segments * 9 + 1 + 21 - 9));
+ if (is_negative) {
+ buffer->data[buffer->size_bytes++] = '-';
+ }
+
+ // The most significant segment should have no leading zeroes
+ int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%lu",
+ (unsigned long)segments[num_segments - 1]);
+ buffer->size_bytes += n_chars;
+
+ // Subsequent output needs to be left-padded with zeroes such that each segment
+ // takes up exactly 9 digits.
+ for (int i = num_segments - 2; i >= 0; i--) {
+ int n_chars = snprintf((char*)buffer->data + buffer->size_bytes, 21, "%09lu",
+ (unsigned long)segments[i]);
+ buffer->size_bytes += n_chars;
+ NANOARROW_DCHECK(buffer->size_bytes <= buffer->capacity_bytes);
+ }
+
+ return NANOARROW_OK;
+}
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
@@ -529,10 +728,33 @@ ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum Arrow
int n_chars;
switch (type) {
case NANOARROW_TYPE_TIME32:
+ if (timezone != NULL) {
+ return EINVAL;
+ }
+
+ switch (time_unit) {
+ case NANOARROW_TIME_UNIT_MICRO:
+ case NANOARROW_TIME_UNIT_NANO:
+ return EINVAL;
+ default:
+ break;
+ }
+
+ n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str);
+ break;
case NANOARROW_TYPE_TIME64:
if (timezone != NULL) {
return EINVAL;
}
+
+ switch (time_unit) {
+ case NANOARROW_TIME_UNIT_SECOND:
+ case NANOARROW_TIME_UNIT_MILLI:
+ return EINVAL;
+ default:
+ break;
+ }
+
n_chars = snprintf(buffer, sizeof(buffer), "tt%s", time_unit_str);
break;
case NANOARROW_TYPE_TIMESTAMP:
diff --git a/dist/nanoarrow.h b/dist/nanoarrow.h
index 331da298..a05c9b93 100644
--- a/dist/nanoarrow.h
+++ b/dist/nanoarrow.h
@@ -948,6 +948,28 @@ static inline void ArrowDecimalSetInt(struct ArrowDecimal* decimal, int64_t valu
decimal->words[decimal->low_word_index] = value;
}
+/// \brief Negate the value of this decimal in place
+/// \ingroup nanoarrow-utils
+static inline void ArrowDecimalNegate(struct ArrowDecimal* decimal) {
+ uint64_t carry = 1;
+
+ if (decimal->low_word_index == 0) {
+ for (int i = 0; i < decimal->n_words; i++) {
+ uint64_t elem = decimal->words[i];
+ elem = ~elem + carry;
+ carry &= (elem == 0);
+ decimal->words[i] = elem;
+ }
+ } else {
+ for (int i = decimal->low_word_index; i >= 0; i--) {
+ uint64_t elem = decimal->words[i];
+ elem = ~elem + carry;
+ carry &= (elem == 0);
+ decimal->words[i] = elem;
+ }
+ }
+}
+
/// \brief Copy bytes from a buffer into this decimal
/// \ingroup nanoarrow-utils
static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal,
@@ -1009,6 +1031,9 @@ static inline void ArrowDecimalSetBytes(struct ArrowDecimal* decimal,
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBufferDeallocator)
#define ArrowErrorSet NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowErrorSet)
#define ArrowLayoutInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowLayoutInit)
+#define ArrowDecimalSetDigits NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalSetDigits)
+#define ArrowDecimalAppendDigitsToBuffer \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowDecimalAppendDigitsToBuffer)
#define ArrowSchemaInit NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInit)
#define ArrowSchemaInitFromType \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowSchemaInitFromType)
@@ -1242,6 +1267,14 @@ void ArrowLayoutInit(struct ArrowLayout* layout, enum ArrowType storage_type);
/// \brief Create a string view from a null-terminated string
static inline struct ArrowStringView ArrowCharView(const char* value);
+/// \brief Sets the integer value of an ArrowDecimal from a string
+ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal,
+ struct ArrowStringView value);
+
+/// \brief Get the integer value of an ArrowDecimal as string
+ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal,
+ struct ArrowBuffer* buffer);
+
/// @}
/// \defgroup nanoarrow-schema Creating schemas
diff --git a/dist/nanoarrow_testing.hpp b/dist/nanoarrow_testing.hpp
index 0b434e8c..4e39ee09 100644
--- a/dist/nanoarrow_testing.hpp
+++ b/dist/nanoarrow_testing.hpp
@@ -850,6 +850,13 @@ class TestingJSONWriter {
break;
}
+ case NANOARROW_TYPE_DECIMAL128:
+ NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 128));
+ break;
+ case NANOARROW_TYPE_DECIMAL256:
+ NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 256));
+ break;
+
default:
// Not supported
return ENOTSUP;
@@ -935,6 +942,37 @@ class TestingJSONWriter {
}
}
+ ArrowErrorCode WriteDecimalData(std::ostream& out, const ArrowArrayView* view,
+ int bitwidth) {
+ ArrowDecimal value;
+ ArrowDecimalInit(&value, bitwidth, 0, 0);
+ nanoarrow::UniqueBuffer tmp;
+
+ NANOARROW_RETURN_NOT_OK(WriteDecimalMaybeNull(out, view, 0, &value, tmp.get()));
+ for (int64_t i = 1; i < view->length; i++) {
+ out << ", ";
+ NANOARROW_RETURN_NOT_OK(WriteDecimalMaybeNull(out, view, i, &value, tmp.get()));
+ }
+
+ return NANOARROW_OK;
+ }
+
+ ArrowErrorCode WriteDecimalMaybeNull(std::ostream& out, const ArrowArrayView* view,
+ int64_t i, ArrowDecimal* decimal,
+ ArrowBuffer* tmp) {
+ if (ArrowArrayViewIsNull(view, i)) {
+ out << R"("0")";
+ return NANOARROW_OK;
+ } else {
+ ArrowArrayViewGetDecimalUnsafe(view, i, decimal);
+ tmp->size_bytes = 0;
+ NANOARROW_RETURN_NOT_OK(ArrowDecimalAppendDigitsToBuffer(decimal, tmp));
+ out << R"(")" << std::string(reinterpret_cast<char*>(tmp->data), tmp->size_bytes)
+ << R"(")";
+ return NANOARROW_OK;
+ }
+ }
+
void WriteString(std::ostream& out, ArrowStringView value) {
out << R"(")";
@@ -2115,6 +2153,10 @@ class TestingJSONReader {
return SetBufferIntervalDayTime(data, buffer, error);
case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
return SetBufferIntervalMonthDayNano(data, buffer, error);
+ case NANOARROW_TYPE_DECIMAL128:
+ return SetBufferDecimal(data, buffer, 128, error);
+ case NANOARROW_TYPE_DECIMAL256:
+ return SetBufferDecimal(data, buffer, 256, error);
default:
ArrowErrorSet(error, "storage type %s DATA buffer not supported",
ArrowTypeString(array_view->storage_type));
@@ -2379,6 +2421,32 @@ class TestingJSONReader {
return NANOARROW_OK;
}
+ ArrowErrorCode SetBufferDecimal(const json& value, ArrowBuffer* buffer, int bitwidth,
+ ArrowError* error) {
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.is_array(), error, "decimal buffer must be array"));
+
+ ArrowDecimal decimal;
+ ArrowDecimalInit(&decimal, bitwidth, 0, 0);
+
+ ArrowStringView item_view;
+
+ for (const auto& item : value) {
+ NANOARROW_RETURN_NOT_OK(
+ Check(item.is_string(), error, "decimal buffer item must be string"));
+ auto item_str = item.get<std::string>();
+ item_view.data = item_str.data();
+ item_view.size_bytes = item_str.size();
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowDecimalSetDigits(&decimal, item_view),
+ error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ ArrowBufferAppend(buffer, decimal.words, decimal.n_words * sizeof(uint64_t)),
+ error);
+ }
+
+ return NANOARROW_OK;
+ }
+
void SetArrayAllocatorRecursive(ArrowArray* array) {
for (int i = 0; i < array->n_buffers; i++) {
ArrowArrayBuffer(array, i)->allocator = allocator_;