You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/05/12 10:54:56 UTC

[GitHub] [arrow] projjal commented on a change in pull request #10195: ARROW-12595: [C++][Gandiva] Implement TO_HEX([binary] field) and FROM_HEX([string]field] functions

projjal commented on a change in pull request #10195:
URL: https://github.com/apache/arrow/pull/10195#discussion_r630911301



##########
File path: cpp/src/gandiva/function_registry_string.cc
##########
@@ -236,6 +236,12 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
       NativeFunction("binary_string", {}, DataTypeVector{utf8()}, binary(),
                      kResultNullIfNull, "binary_string", NativeFunction::kNeedsContext),
 
+      NativeFunction("to_hex", {}, DataTypeVector{binary()}, utf8(), kResultNullIfNull,
+                     "to_hex_binary", NativeFunction::kNeedsContext),
+
+      NativeFunction("from_hex", {}, DataTypeVector{utf8()}, binary(), kResultNullIfNull,
+                     "from_hex", NativeFunction::kNeedsContext),

Review comment:
       nit: consider naming the precompiled function to match `to_hex_binary`:
   > from_hex_utf8

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1520,4 +1521,83 @@ const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_le
   return ret;
 }
 
+// Gets a binary object and returns its hexadecimal representation. That representation
+// maps each byte in the input to a 2-length string containing a hexadecimal number.
+// - Examples:
+//     - foo -> 666F6F = 66[f] 6F[o] 6F[o]
+//     - bar -> 626172 = 62[b] 61[a] 72[r]
+FORCE_INLINE
+const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
+                          int32_t* out_len) {
+  if (text_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len * 2));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  uint32_t ret_index = 0;
+  uint32_t max_len = static_cast<uint32_t>(text_len) * 2;
+  uint32_t max_char_to_write = 2 * max_len + 1;
+
+  for (gdv_int32 i = 0; i < text_len; i++) {
+    DCHECK(ret_index >= 0 && ret_index < max_len);
+
+    int32_t ch = static_cast<int32_t>(text[i]) & 0xFF;
+
+    ret_index += snprintf(ret + ret_index, max_char_to_write, "%02X", ch);
+  }
+
+  *out_len = static_cast<int32_t>(ret_index);
+  return ret;
+}
+
+FORCE_INLINE
+const char* from_hex(int64_t context, const char* text, int32_t text_len,
+                     int32_t* out_len) {
+  if (text_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // the input string should have a length multiple of two
+  if (text_len % 2 != 0) {
+    gdv_fn_context_set_error_msg(
+        context, "Error parsing hex string, length was not a multiple of two.");
+    *out_len = 0;
+    return "";
+  }
+
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len));

Review comment:
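       Shouldn't this allocate `text_len / 2` bytes? Every two hex characters in the input decode to a single output byte.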

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1520,4 +1521,42 @@ const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_le
   return ret;
 }
 
+// Gets a binary object and returns its hexadecimal representation. That representation
+// maps each byte in the input to a 2-length string containing a hexadecimal number.
+// - Examples:
+//     - foo -> 666F6F = 66[f] 6F[o] 6F[o]
+//     - bar -> 626172 = 62[b] 61[a] 72[r]
+FORCE_INLINE
+const char* to_hex_binary(gdv_int64 context, const char* text, gdv_int32 text_len,
+                          gdv_int32* out_len) {
+  if (text_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  auto ret =
+      reinterpret_cast<gdv_utf8>(gdv_fn_context_arena_malloc(context, text_len * 2));
+
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  gdv_uint32 ret_index = 0;
+  gdv_uint32 max_len = static_cast<gdv_uint32>(text_len) * 2;
+  gdv_uint32 max_char_to_write = 4;
+
+  for (gdv_int32 i = 0; i < text_len; i++) {
+    DCHECK(ret_index >= 0 && ret_index < max_len);
+
+    gdv_int32 ch = static_cast<gdv_int32>(text[i]) & 0xFF;
+
+    ret_index += snprintf(ret + ret_index, max_char_to_write, "%02X", ch);

Review comment:
       Why set 2 * max_len + 1 here? I meant allocating a buffer of size 2 * max_len + 1; otherwise you will be writing past the end of the buffer.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org