You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/02/02 03:21:36 UTC
[GitHub] [arrow] anthonylouisbsb commented on a change in pull request #12306: ARROW-15083: [C++][Gandiva] Implement Conv Function

anthonylouisbsb commented on a change in pull request #12306:
URL: https://github.com/apache/arrow/pull/12306#discussion_r797235221



##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -683,6 +683,202 @@ const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
+GDV_FORCE_INLINE
+uint64_t unsigned_long_div(gdv_int64 x, gdv_int32 m) {
+  if (x >= 0) {
+    return x / m;
+  }
+  return x / m + 2 * (LONG_MAX / m) + 2 / m + (x % m + 2 * (LONG_MAX % m) + 2 % m) / m;
+}
+
+GDV_FORCE_INLINE
+gdv_int64 encode(gdv_int32 radix, gdv_int32 fromPos, const char* value,
+                 gdv_int32 valueLen) {
+  uint64_t val = 0;
+  uint64_t bound = unsigned_long_div(-1 - radix, radix);
+
+  for (int i = fromPos; i < valueLen && value[i] >= 0; i++) {
+    if (val >= bound) {
+      if (unsigned_long_div(-1 - value[i], radix) < val) {
+        return -1;
+      }
+    }
+    val = val * radix + value[i];
+  }
+  return val;
+}
+
+GDV_FORCE_INLINE
+void decode(gdv_int64 val, gdv_int32 radix, char* value, gdv_int32 valueLen) {
+  for (int i = 0; i < valueLen; i++) {
+    value[i] = static_cast<char>(0);
+  }
+
+  for (int i = valueLen - 1; val != 0; i--) {
+    gdv_int64 q = unsigned_long_div(val, radix);
+    value[i] = static_cast<char>((val - q * radix));
+    val = q;
+  }
+}
+
+GDV_FORCE_INLINE
+char character_for_digit(gdv_int32 value, gdv_int32 radix) {  // From Decimal to Any Base

Review comment:
       ```suggestion
   // From Decimal to Any Base
   GDV_FORCE_INLINE
   char character_for_digit(gdv_int32 value, gdv_int32 radix) {  
   ```

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -683,6 +683,202 @@ const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
+GDV_FORCE_INLINE
+uint64_t unsigned_long_div(gdv_int64 x, gdv_int32 m) {
+  if (x >= 0) {
+    return x / m;
+  }
+  return x / m + 2 * (LONG_MAX / m) + 2 / m + (x % m + 2 * (LONG_MAX % m) + 2 % m) / m;
+}
+
+GDV_FORCE_INLINE
+gdv_int64 encode(gdv_int32 radix, gdv_int32 fromPos, const char* value,
+                 gdv_int32 valueLen) {
+  uint64_t val = 0;
+  uint64_t bound = unsigned_long_div(-1 - radix, radix);
+
+  for (int i = fromPos; i < valueLen && value[i] >= 0; i++) {
+    if (val >= bound) {
+      if (unsigned_long_div(-1 - value[i], radix) < val) {
+        return -1;
+      }
+    }
+    val = val * radix + value[i];
+  }
+  return val;
+}
+
+GDV_FORCE_INLINE
+void decode(gdv_int64 val, gdv_int32 radix, char* value, gdv_int32 valueLen) {
+  for (int i = 0; i < valueLen; i++) {
+    value[i] = static_cast<char>(0);
+  }
+
+  for (int i = valueLen - 1; val != 0; i--) {
+    gdv_int64 q = unsigned_long_div(val, radix);
+    value[i] = static_cast<char>((val - q * radix));
+    val = q;
+  }
+}
+
+GDV_FORCE_INLINE
+char character_for_digit(gdv_int32 value, gdv_int32 radix) {  // From Decimal to Any Base
+  // This function is similar to Character.forDigit in Java
+  int digit = 0;
+  digit = value % radix;
+  if (digit < 10) {
+    return static_cast<char>(digit + '0');
+  } else {
+    return static_cast<char>(digit + 'A' - 10);
+  }
+}
+
+GDV_FORCE_INLINE
+gdv_int64 character_digit(char value, gdv_int32 radix) {  // From any base to Decimal
+  // This function is similar to Character.digit in Java
+  if ((radix <= 0) || (radix > 36)) {
+    return -1;
+  }
+
+  if (radix <= 10) {
+    if (value >= '0' && value < '0' + radix) {
+      return value - '0';
+    } else {
+      return -1;
+    }
+  } else if (value >= '0' && value <= '9') {
+    return value - '0';
+  } else if (value >= 'a' && value < 'a' + radix - 10) {
+    return value - 'a' + 10;
+  } else if (value >= 'A' && value < 'A' + radix - 10) {
+    return value - 'A' + 10;
+  }
+
+  return -1;
+}
+
+GDV_FORCE_INLINE
+void byte2char(gdv_int32 radix, gdv_int32 fromPos, char* value, gdv_int32 valueLen) {
+  for (int i = fromPos; i < valueLen; i++) {
+    value[i] = static_cast<char>(character_for_digit(value[i], radix));
+  }
+}
+
+GDV_FORCE_INLINE
+void char2byte(gdv_int32 radix, gdv_int32 fromPos, char* value, gdv_int32 valueLen) {
+  for (int i = fromPos; i < valueLen; i++) {
+    value[i] = static_cast<char>(character_digit(value[i], radix));
+  }
+}
+
+GANDIVA_EXPORT
+const char* conv_int64_int32_int32(gdv_int64 context, gdv_int64 in, gdv_int32 from_base,
+                                   gdv_int32 to_base, int32_t* out_len) {
+  std::string to_utf8 = std::to_string(in);
+  char* in_utf8 = &to_utf8[0];
+  gdv_int32 in_utf8_len = to_utf8.length();
+
+  return conv_utf8_int32_int32(context, in_utf8, in_utf8_len, from_base, to_base,
+                               out_len);
+}
+
+GANDIVA_EXPORT
+const char* conv_int32_int32_int32(gdv_int64 context, gdv_int32 in, gdv_int32 from_base,
+                                   gdv_int32 to_base, int32_t* out_len) {
+  std::string to_utf8 = std::to_string(in);
+  char* in_utf8 = &to_utf8[0];
+  gdv_int32 in_utf8_len = to_utf8.length();
+
+  return conv_utf8_int32_int32(context, in_utf8, in_utf8_len, from_base, to_base,
+                               out_len);
+}
+
+GANDIVA_EXPORT
+const char* conv_utf8_int32_int32(gdv_int64 context, const char* in, int32_t in_len,
+                                  gdv_int32 from_base, gdv_int32 to_base,
+                                  int32_t* out_len) {
+  if (in_len <= 0) {
+    out_len = 0;
+    return "";
+  }
+
+  gdv_int32 valueLen = 64;
+  char* value = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, valueLen));
+  char* num = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));
+
+  if (value == nullptr || num == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    out_len = 0;
+    return "";
+  }
+
+  int fromBs = from_base;
+  int toBs = to_base;
+
+  if (fromBs < std::numeric_limits<char>::min() ||
+      fromBs > std::numeric_limits<char>::max() ||
+      abs(toBs) < std::numeric_limits<char>::min() ||
+      abs(toBs) > std::numeric_limits<char>::max()) {
+    // Checking if the variable is in range limit
+    gdv_fn_context_set_error_msg(context,
+                                 "The numerical limit of this variable is out range");
+    *out_len = 0;
+    return "";
+  }

Review comment:
       Also, I think these numerical bases do not exist: `1` and `0`

##########
File path: cpp/src/gandiva/gdv_function_stubs_test.cc
##########
@@ -949,4 +949,65 @@ TEST(TestGdvFnStubs, TestMaskLastN) {
   EXPECT_EQ(expected, std::string(result, out_len));
 }
 
+TEST(TestGdvFnStubs, TestConv) {
+  gandiva::ExecutionContext ctx;
+
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* value = conv_utf8_int32_int32(ctx_ptr, "1000101101", 10, 2, 10, &out_len);
+  std::string out_value = std::string(value, out_len);
+  EXPECT_EQ(out_value, "557");
+
+  value = conv_utf8_int32_int32(ctx_ptr, "ffa", 3, 16, 10, &out_len);

Review comment:
       Another test case would be when the user passes invalid strings:
   ```
   123-4565
   aaaa@#aa
   ffe sdkl
   ```

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -683,6 +683,202 @@ const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
+GDV_FORCE_INLINE
+uint64_t unsigned_long_div(gdv_int64 x, gdv_int32 m) {
+  if (x >= 0) {
+    return x / m;
+  }
+  return x / m + 2 * (LONG_MAX / m) + 2 / m + (x % m + 2 * (LONG_MAX % m) + 2 % m) / m;
+}
+
+GDV_FORCE_INLINE
+gdv_int64 encode(gdv_int32 radix, gdv_int32 fromPos, const char* value,
+                 gdv_int32 valueLen) {
+  uint64_t val = 0;
+  uint64_t bound = unsigned_long_div(-1 - radix, radix);
+
+  for (int i = fromPos; i < valueLen && value[i] >= 0; i++) {
+    if (val >= bound) {
+      if (unsigned_long_div(-1 - value[i], radix) < val) {
+        return -1;
+      }
+    }
+    val = val * radix + value[i];
+  }
+  return val;
+}
+
+GDV_FORCE_INLINE
+void decode(gdv_int64 val, gdv_int32 radix, char* value, gdv_int32 valueLen) {
+  for (int i = 0; i < valueLen; i++) {
+    value[i] = static_cast<char>(0);
+  }
+
+  for (int i = valueLen - 1; val != 0; i--) {
+    gdv_int64 q = unsigned_long_div(val, radix);
+    value[i] = static_cast<char>((val - q * radix));
+    val = q;
+  }
+}
+
+GDV_FORCE_INLINE
+char character_for_digit(gdv_int32 value, gdv_int32 radix) {  // From Decimal to Any Base
+  // This function is similar to Character.forDigit in Java
+  int digit = 0;
+  digit = value % radix;
+  if (digit < 10) {
+    return static_cast<char>(digit + '0');
+  } else {
+    return static_cast<char>(digit + 'A' - 10);
+  }
+}
+
+GDV_FORCE_INLINE
+gdv_int64 character_digit(char value, gdv_int32 radix) {  // From any base to Decimal
+  // This function is similar to Character.digit in Java
+  if ((radix <= 0) || (radix > 36)) {
+    return -1;
+  }
+
+  if (radix <= 10) {
+    if (value >= '0' && value < '0' + radix) {
+      return value - '0';
+    } else {
+      return -1;
+    }
+  } else if (value >= '0' && value <= '9') {
+    return value - '0';
+  } else if (value >= 'a' && value < 'a' + radix - 10) {
+    return value - 'a' + 10;
+  } else if (value >= 'A' && value < 'A' + radix - 10) {
+    return value - 'A' + 10;
+  }
+
+  return -1;
+}
+
+GDV_FORCE_INLINE
+void byte2char(gdv_int32 radix, gdv_int32 fromPos, char* value, gdv_int32 valueLen) {
+  for (int i = fromPos; i < valueLen; i++) {
+    value[i] = static_cast<char>(character_for_digit(value[i], radix));
+  }
+}
+
+GDV_FORCE_INLINE
+void char2byte(gdv_int32 radix, gdv_int32 fromPos, char* value, gdv_int32 valueLen) {
+  for (int i = fromPos; i < valueLen; i++) {
+    value[i] = static_cast<char>(character_digit(value[i], radix));
+  }
+}
+
+GANDIVA_EXPORT
+const char* conv_int64_int32_int32(gdv_int64 context, gdv_int64 in, gdv_int32 from_base,
+                                   gdv_int32 to_base, int32_t* out_len) {
+  std::string to_utf8 = std::to_string(in);
+  char* in_utf8 = &to_utf8[0];
+  gdv_int32 in_utf8_len = to_utf8.length();
+
+  return conv_utf8_int32_int32(context, in_utf8, in_utf8_len, from_base, to_base,
+                               out_len);
+}
+
+GANDIVA_EXPORT
+const char* conv_int32_int32_int32(gdv_int64 context, gdv_int32 in, gdv_int32 from_base,
+                                   gdv_int32 to_base, int32_t* out_len) {
+  std::string to_utf8 = std::to_string(in);
+  char* in_utf8 = &to_utf8[0];
+  gdv_int32 in_utf8_len = to_utf8.length();
+
+  return conv_utf8_int32_int32(context, in_utf8, in_utf8_len, from_base, to_base,
+                               out_len);
+}
+
+GANDIVA_EXPORT
+const char* conv_utf8_int32_int32(gdv_int64 context, const char* in, int32_t in_len,
+                                  gdv_int32 from_base, gdv_int32 to_base,
+                                  int32_t* out_len) {
+  if (in_len <= 0) {
+    out_len = 0;
+    return "";
+  }
+
+  gdv_int32 valueLen = 64;
+  char* value = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, valueLen));
+  char* num = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));
+
+  if (value == nullptr || num == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    out_len = 0;
+    return "";
+  }
+
+  int fromBs = from_base;
+  int toBs = to_base;
+
+  if (fromBs < std::numeric_limits<char>::min() ||
+      fromBs > std::numeric_limits<char>::max() ||
+      abs(toBs) < std::numeric_limits<char>::min() ||
+      abs(toBs) > std::numeric_limits<char>::max()) {
+    // Checking if the variable is in range limit
+    gdv_fn_context_set_error_msg(context,
+                                 "The numerical limit of this variable is out range");
+    *out_len = 0;
+    return "";
+  }

Review comment:
       Since the only digits we can use to write a number are `0-9`, `a-z` and `A-Z`, I think the upper limit for the base is much lower than the integer limit

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -683,6 +683,202 @@ const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
+GDV_FORCE_INLINE
+uint64_t unsigned_long_div(gdv_int64 x, gdv_int32 m) {
+  if (x >= 0) {
+    return x / m;
+  }
+  return x / m + 2 * (LONG_MAX / m) + 2 / m + (x % m + 2 * (LONG_MAX % m) + 2 % m) / m;
+}
+
+GDV_FORCE_INLINE
+gdv_int64 encode(gdv_int32 radix, gdv_int32 fromPos, const char* value,
+                 gdv_int32 valueLen) {
+  uint64_t val = 0;
+  uint64_t bound = unsigned_long_div(-1 - radix, radix);
+
+  for (int i = fromPos; i < valueLen && value[i] >= 0; i++) {
+    if (val >= bound) {
+      if (unsigned_long_div(-1 - value[i], radix) < val) {
+        return -1;
+      }
+    }
+    val = val * radix + value[i];
+  }
+  return val;
+}
+
+GDV_FORCE_INLINE
+void decode(gdv_int64 val, gdv_int32 radix, char* value, gdv_int32 valueLen) {
+  for (int i = 0; i < valueLen; i++) {
+    value[i] = static_cast<char>(0);
+  }
+
+  for (int i = valueLen - 1; val != 0; i--) {
+    gdv_int64 q = unsigned_long_div(val, radix);
+    value[i] = static_cast<char>((val - q * radix));
+    val = q;
+  }
+}
+
+GDV_FORCE_INLINE
+char character_for_digit(gdv_int32 value, gdv_int32 radix) {  // From Decimal to Any Base
+  // This function is similar to Character.forDigit in Java
+  int digit = 0;
+  digit = value % radix;
+  if (digit < 10) {
+    return static_cast<char>(digit + '0');
+  } else {
+    return static_cast<char>(digit + 'A' - 10);
+  }
+}
+
+GDV_FORCE_INLINE
+gdv_int64 character_digit(char value, gdv_int32 radix) {  // From any base to Decimal

Review comment:
       Ditto: place the comment above the function instead of beside its signature
   

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -683,6 +683,202 @@ const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
+GDV_FORCE_INLINE
+uint64_t unsigned_long_div(gdv_int64 x, gdv_int32 m) {
+  if (x >= 0) {
+    return x / m;
+  }
+  return x / m + 2 * (LONG_MAX / m) + 2 / m + (x % m + 2 * (LONG_MAX % m) + 2 % m) / m;
+}
+
+GDV_FORCE_INLINE
+gdv_int64 encode(gdv_int32 radix, gdv_int32 fromPos, const char* value,
+                 gdv_int32 valueLen) {
+  uint64_t val = 0;
+  uint64_t bound = unsigned_long_div(-1 - radix, radix);
+
+  for (int i = fromPos; i < valueLen && value[i] >= 0; i++) {
+    if (val >= bound) {
+      if (unsigned_long_div(-1 - value[i], radix) < val) {
+        return -1;
+      }
+    }
+    val = val * radix + value[i];
+  }
+  return val;
+}
+
+GDV_FORCE_INLINE
+void decode(gdv_int64 val, gdv_int32 radix, char* value, gdv_int32 valueLen) {
+  for (int i = 0; i < valueLen; i++) {
+    value[i] = static_cast<char>(0);
+  }
+
+  for (int i = valueLen - 1; val != 0; i--) {
+    gdv_int64 q = unsigned_long_div(val, radix);
+    value[i] = static_cast<char>((val - q * radix));
+    val = q;
+  }
+}
+
+GDV_FORCE_INLINE
+char character_for_digit(gdv_int32 value, gdv_int32 radix) {  // From Decimal to Any Base

Review comment:
       Place the comment above the function it describes; it is easier to read there than at the side of the signature

##########
File path: cpp/src/gandiva/gdv_function_stubs_test.cc
##########
@@ -949,4 +949,65 @@ TEST(TestGdvFnStubs, TestMaskLastN) {
   EXPECT_EQ(expected, std::string(result, out_len));
 }
 
+TEST(TestGdvFnStubs, TestConv) {
+  gandiva::ExecutionContext ctx;
+
+  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
+  gdv_int32 out_len = 0;
+
+  const char* value = conv_utf8_int32_int32(ctx_ptr, "1000101101", 10, 2, 10, &out_len);

Review comment:
       Will the function work correctly if the user passes a number with many leading zeros?
   
   Example: "000000000001" in binary to decimal -> "1"
   Another example: "-000000001" in binary to decimal -> "-1"




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org