You are viewing a plain text version of this content. The canonical link for it is here.

Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/05/13 19:18:27 UTC

[GitHub] [arrow] nirandaperera opened a new pull request #10317: ARROW-12713 [C++] String reverse kernel

nirandaperera opened a new pull request #10317:
URL: https://github.com/apache/arrow/pull/10317


   This PR adds ASCII and UTF-8 string reverse kernels.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#issuecomment-845942199


   @lidavidm I think this looks okay now. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632624475



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;

Review comment:
       Use `bool` instead.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       This is potentially reading out of bounds if there is a truncated utf8 character at the end of the input.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       This deserves to be an inline function in `util/utf8.h`, I think (perhaps it even already exists?).
   
   

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));
+      i += offset;

Review comment:
       There's a problem: `offset` can be 0 in the lookup table, and then you have an infinite loop.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,25 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Reverse) {
+  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+  this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
+  this->CheckUnary("utf8_reverse", "[\"ɑɽⱤæÆ&\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"&ÆæⱤɽɑ\", null, \"\", \"bbb\"]");
+}

Review comment:
       Can you add an example of invalid UTF8? We don't care that it produces useful results, but it should not crash or access memory out of bounds.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,25 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));

Review comment:
       Please test the error return more precisely, instead of only checking that it's not ok.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#issuecomment-840976553


   I learned from @cyb70289 that new kernels need to be added to both the C++ and Python documentation:
   1. https://github.com/apache/arrow/blob/master/docs/source/cpp/compute.rst
   2. https://github.com/apache/arrow/blob/master/docs/source/python/api/compute.rst
   Note that kernel names are in alphabetical order.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633767010



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2317,6 +2372,21 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
+const FunctionDoc ascii_reverse_doc(
+    "Reverse ASCII input",
+    ("For each ASCII string in `strings`, return a reversed version.\n\n"
+     "This function assumes the input is fully ASCII.  If it may contain\n"
+     "non-ASCII characters, use \"utf8_reverse\" instead."),
+    {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+    "Reverse utf8 input",
+    ("For each utf8 string in `strings`, return a reversed version.\n\n"
+     "This function works on UTF8 codepoint level. As a consequence, if the input \n"
+     "contains combining UTF8 character sequences/ 'graphemes', they would also \n"
+     "be reversed."),

Review comment:
       Just to reword things a bit for clarity perhaps:
   
   > Reverse input
   > For each string in `strings`, return a reversed version.
   > This function operates on codepoints/UTF-8 code units, not grapheme clusters. Hence, it will not correctly reverse grapheme clusters composed of multiple codepoints.
   
   We already assume strings are UTF-8, so we don't need to repeat that.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       I don't think we need to explicitly validate. Note the user themselves can call ValidateFull() which IIRC will also check for UTF-8.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +271,56 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+
+  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }

Review comment:
       Nit: can we be consistent with `ASCII`?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -435,47 +435,59 @@ String transforms
 +==========================+============+=========================+=====================+=========+=======================================+
 | ascii_lower              | Unary      | String-like             | String-like         | \(1)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| ascii_reverse            | Unary      | String-like             | String-like         | \(2)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 | ascii_upper              | Unary      | String-like             | String-like         | \(1)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| binary_length            | Unary      | Binary- or String-like  | Int32 or Int64      | \(2)    |                                       |
+| binary_length            | Unary      | Binary- or String-like  | Int32 or Int64      | \(3)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| replace_substring        | Unary      | String-like             | String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| replace_substring        | Unary      | String-like             | String-like         | \(3)    | :struct:`ReplaceSubstringOptions`     |
+| replace_substring_regex  | Unary      | String-like             | String-like         | \(5)    | :struct:`ReplaceSubstringOptions`     |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| replace_substring_regex  | Unary      | String-like             | String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
+| utf8_length              | Unary      | String-like             | Int32 or Int64      | \(6)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_length              | Unary      | String-like             | Int32 or Int64      | \(5)    |                                       |
+| utf8_lower               | Unary      | String-like             | String-like         | \(7)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_lower               | Unary      | String-like             | String-like         | \(6)    |                                       |
+| utf8_reverse             | Unary      | String-like             | String-like         | \(8)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_upper               | Unary      | String-like             | String-like         | \(6)    |                                       |
+| utf8_upper               | Unary      | String-like             | String-like         | \(7)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 
 
 * \(1) Each ASCII character in the input is converted to lowercase or
   uppercase.  Non-ASCII characters are left untouched.
 
-* \(2) Output is the physical length in bytes of each input element.  Output
+* \(2) ASCII input is reversed to the output. If non-ASCII characters 
+  are present, ``Invalid`` :class:`Status` will be returned.
+
+* \(3) Output is the physical length in bytes of each input element.  Output
   type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.
 
-* \(3) Replace non-overlapping substrings that match to
+* \(4) Replace non-overlapping substrings that match to
   :member:`ReplaceSubstringOptions::pattern` by
   :member:`ReplaceSubstringOptions::replacement`. If
   :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
   maximum number of replacements made, counting from the left.
 
-* \(4) Replace non-overlapping substrings that match to the regular expression
+* \(5) Replace non-overlapping substrings that match to the regular expression
   :member:`ReplaceSubstringOptions::pattern` by
   :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
   :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
   maximum number of replacements made, counting from the left. Note that if the
   pattern contains groups, backreferencing can be used.
 
-* \(5) Output is the number of characters (not bytes) of each input element.
+* \(6) Output is the number of characters (not bytes) of each input element.
   Output type is Int32 for String, Int64 for LargeString. 
 
-* \(6) Each UTF8-encoded character in the input is converted to lowercase or
+* \(7) Each UTF8-encoded character in the input is converted to lowercase or
   uppercase.
 
+* \(8) UTF8-encoded input is reversed to the output. If malformed-UTF8 characters 
+  are present, the Output would be undefined (but the size of Output buffers would
+  be preserved). If combined UTF8 characters are available (ex: graphems, glyphs), 
+  they would also be reversed. 
+

Review comment:
       ```suggestion
   * \(8) Each UTF8-encoded code unit is written in reverse order to the output.
     If the input is not valid UTF8, then the output is undefined (but the size of output
     buffers will be preserved).
   
   ```

##########
File path: docs/source/cpp/compute.rst
##########
@@ -435,47 +435,59 @@ String transforms
 +==========================+============+=========================+=====================+=========+=======================================+
 | ascii_lower              | Unary      | String-like             | String-like         | \(1)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| ascii_reverse            | Unary      | String-like             | String-like         | \(2)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 | ascii_upper              | Unary      | String-like             | String-like         | \(1)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| binary_length            | Unary      | Binary- or String-like  | Int32 or Int64      | \(2)    |                                       |
+| binary_length            | Unary      | Binary- or String-like  | Int32 or Int64      | \(3)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| replace_substring        | Unary      | String-like             | String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| replace_substring        | Unary      | String-like             | String-like         | \(3)    | :struct:`ReplaceSubstringOptions`     |
+| replace_substring_regex  | Unary      | String-like             | String-like         | \(5)    | :struct:`ReplaceSubstringOptions`     |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| replace_substring_regex  | Unary      | String-like             | String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
+| utf8_length              | Unary      | String-like             | Int32 or Int64      | \(6)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_length              | Unary      | String-like             | Int32 or Int64      | \(5)    |                                       |
+| utf8_lower               | Unary      | String-like             | String-like         | \(7)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_lower               | Unary      | String-like             | String-like         | \(6)    |                                       |
+| utf8_reverse             | Unary      | String-like             | String-like         | \(8)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
-| utf8_upper               | Unary      | String-like             | String-like         | \(6)    |                                       |
+| utf8_upper               | Unary      | String-like             | String-like         | \(7)    |                                       |
 +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 
 
 * \(1) Each ASCII character in the input is converted to lowercase or
   uppercase.  Non-ASCII characters are left untouched.
 
-* \(2) Output is the physical length in bytes of each input element.  Output
+* \(2) ASCII input is reversed to the output. If non-ASCII characters 
+  are present, ``Invalid`` :class:`Status` will be returned.
+
+* \(3) Output is the physical length in bytes of each input element.  Output
   type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.
 
-* \(3) Replace non-overlapping substrings that match to
+* \(4) Replace non-overlapping substrings that match to
   :member:`ReplaceSubstringOptions::pattern` by
   :member:`ReplaceSubstringOptions::replacement`. If
   :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
   maximum number of replacements made, counting from the left.
 
-* \(4) Replace non-overlapping substrings that match to the regular expression
+* \(5) Replace non-overlapping substrings that match to the regular expression
   :member:`ReplaceSubstringOptions::pattern` by
   :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
   :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
   maximum number of replacements made, counting from the left. Note that if the
   pattern contains groups, backreferencing can be used.
 
-* \(5) Output is the number of characters (not bytes) of each input element.
+* \(6) Output is the number of characters (not bytes) of each input element.
   Output type is Int32 for String, Int64 for LargeString. 
 
-* \(6) Each UTF8-encoded character in the input is converted to lowercase or
+* \(7) Each UTF8-encoded character in the input is converted to lowercase or
   uppercase.
 
+* \(8) UTF8-encoded input is reversed to the output. If malformed-UTF8 characters 
+  are present, the Output would be undefined (but the size of Output buffers would
+  be preserved). If combined UTF8 characters are available (ex: graphems, glyphs), 
+  they would also be reversed. 
+

Review comment:
       I don't think we need to try to explain the subtleties of grapheme clusters, just note that this operates on codepoints/code units, not grapheme clusters. Proper Unicode handling could fill an entire book.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632629009



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       We don't really care about making error cases fast.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632624306



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       I'm ok with the check.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632631612



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       If the ASCII version checks for invalid characters, does this mean that the UTF8 version should also validate its input? Not sure if this is the case for all the other string kernels, but I think having consistent behavior is desirable.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633855837



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +271,56 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+
+  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }

Review comment:
       I am reusing the `StringTransform` class for both ascii and utf8 kernels. In that class, the error is always `Status::Invalid("Invalid UTF8 sequence in input")`, which is not quite right for an ascii kernel. That's why it was generalized and offloaded to `Derived::InvalidStatus()` :slightly_smiling_face: 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632636762



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       The ASCII version would produce invalid output for valid (but non-ASCII) input, which is why it checks for ASCII validity. Invalid input is a different situation which we don't need to check for.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632635288



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       I see. In that case, we might have to do something like this.
   ```
   offset_type stride = std::min(i+offset, input_string_ncodeunits);
   std::copy(input + i, input + stride, output + (input_string_ncodeunits - stride));
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       I see. In that case, we might have to do something like this.
   ```c++
   offset_type stride = std::min(i+offset, input_string_ncodeunits);
   std::copy(input + i, input + stride, output + (input_string_ncodeunits - stride));
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r636098185



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +271,56 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+
+  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }

Review comment:
       Just to clarify here, the structure of the kernel means the 'implementation' doesn't directly return an error, only true/false. Would you prefer to change it so that it returns a status instead?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632626972



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       Consider the case where you have a long string, would it be better to have the check inside the loop so that it stops as soon as an invalid character is found?  Perhaps using `ARROW_PREDICT_FALSE`. I understand a conditional check in a loop may break any compiler vectorization, so I would suggest verifying if this is the case.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r637038217



##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,31 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  testing::HasSubstr("Non-ASCII sequence in input"),
+                                  CallFunction("ascii_reverse", {invalid_input}));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Reverse) {
+  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+  this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])",
+                   this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");
+
+  // inputs with malformed utf8 chars would produce garbage output, but the end result
+  // would produce arrays with same lengths. Hence checking offset buffer equality
+  auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]");
+  const Result<Datum>& res = CallFunction("utf8_reverse", {malformed_input});
+  ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));

Review comment:
       Just a nit, but FTR we probably have an `AssertBufferEquals` (or perhaps `AssertBuffersEqual` :-)).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632632940



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;

Review comment:
       I used a `uint8_t` because `input[i] & 0x80` generates a `0x80`. Should I add something like
   `utf8_char_found |= ((input[i] & 0x80) != 0);`? (The inner parentheses are needed because `!=` binds tighter than `&`.)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632630010



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       Good eye @pitrou. Possibly add a check to ensure `input + i + offset` does not go out of bounds before trying to copy. An alternative would be to add an inner loop to traverse the bytes of a UTF8 character, but this may be less performant than using the LUT.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633820075



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       I was referring to `std::min` vs. the bitwise operation, but ended up using `std::min`.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#issuecomment-842592009


   Remember to update Python documentation as well, https://github.com/apache/arrow/blob/master/docs/source/python/api/compute.rst


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632648238



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       We can also use this bit twiddling hack for `min`. 
   ```c
   r = y ^ ((x ^ y) & -(x < y)); // min(x, y)
   ```
   ref: https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
   
   Any preference?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       As of this implementation, yes, I am returning `true` in both cases. 
   
   Based on @pitrou's comment, what I understood was that if there are invalid utf8 chars, we'll guarantee that the program runs correctly, but the output would be garbage. 
   
   I can actually change the logic to validate utf8 bytes etc., but then AFAIU, I'll have to drop the lookup table and count the valid utf8 bytes whenever I encounter a non-ascii byte. 
   
   If everyone agrees, I'll add that :slightly_smiling_face: 

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       I found a problem in the LUT tbh :smile: `0xFF` is a malformed utf8 byte, but that would mess everything up! :confused: So it looks like I'll have to take the 5th bit into account in the LUT as well.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,30 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),

Review comment:
       +1. I also agree that this is a weird error str to throw from an ascii kernel. 

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       Final LUT would look like this. 
   |    | bit7 | bit6 | bit5 | bit4 | bit3 | char byte size | notes    |
   |----|------|------|------|------|------|----------------|----------|
   | 0  | 0    | 0    | 0    | 0    | 0    | 1              | ascii    |
   | 1  | 0    | 0    | 0    | 0    | 1    | 1              | ascii    |
   | 2  | 0    | 0    | 0    | 1    | 0    | 1              | ascii    |
   | 3  | 0    | 0    | 0    | 1    | 1    | 1              | ascii    |
   | 4  | 0    | 0    | 1    | 0    | 0    | 1              | ascii    |
   | 5  | 0    | 0    | 1    | 0    | 1    | 1              | ascii    |
   | 6  | 0    | 0    | 1    | 1    | 0    | 1              | ascii    |
   | 7  | 0    | 0    | 1    | 1    | 1    | 1              | ascii    |
   | 8  | 0    | 1    | 0    | 0    | 0    | 1              | ascii    |
   | 9  | 0    | 1    | 0    | 0    | 1    | 1              | ascii    |
   | 10 | 0    | 1    | 0    | 1    | 0    | 1              | ascii    |
   | 11 | 0    | 1    | 0    | 1    | 1    | 1              | ascii    |
   | 12 | 0    | 1    | 1    | 0    | 0    | 1              | ascii    |
   | 13 | 0    | 1    | 1    | 0    | 1    | 1              | ascii    |
   | 14 | 0    | 1    | 1    | 1    | 0    | 1              | ascii    |
   | 15 | 0    | 1    | 1    | 1    | 1    | 1              | ascii    |
   | 16 | 1    | 0    | 0    | 0    | 0    | 1              | internal |
   | 17 | 1    | 0    | 0    | 0    | 1    | 1              | internal |
   | 18 | 1    | 0    | 0    | 1    | 0    | 1              | internal |
   | 19 | 1    | 0    | 0    | 1    | 1    | 1              | internal |
   | 20 | 1    | 0    | 1    | 0    | 0    | 1              | internal |
   | 21 | 1    | 0    | 1    | 0    | 1    | 1              | internal |
   | 22 | 1    | 0    | 1    | 1    | 0    | 1              | internal |
   | 23 | 1    | 0    | 1    | 1    | 1    | 1              | internal |
   | 24 | 1    | 1    | 0    | 0    | 0    | 2              |          |
   | 25 | 1    | 1    | 0    | 0    | 1    | 2              |          |
   | 26 | 1    | 1    | 0    | 1    | 0    | 2              |          |
   | 27 | 1    | 1    | 0    | 1    | 1    | 2              |          |
   | 28 | 1    | 1    | 1    | 0    | 0    | 3              |          |
   | 29 | 1    | 1    | 1    | 0    | 1    | 3              |          |
   | 30 | 1    | 1    | 1    | 1    | 0    | 4              |          |
   | 31 | 1    | 1    | 1    | 1    | 1    | 1              | invalid  |




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633832364



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +271,56 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+
+  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }

Review comment:
       I do not think you should add *InvalidStatus* methods with specific messages that do not necessarily generalize for all string errors. Why not directly invoke `Status::Invalid("Specific message to specific code block")`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +271,56 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+
+  static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); }

Review comment:
       I do not think you should add `InvalidStatus` methods with specific messages that do not necessarily generalize for all string errors. Why not directly invoke `Status::Invalid("Specific message to specific code block")`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] kiszk commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

kiszk commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633107773



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       Just out of curiosity: if the input consists purely of ASCII chars, do we return true?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       Just out of curiosity:
   - If we have all pure ascii chars, do we return true?
   - If we have an invalid UTF-8 sequence (e.g. `0b1111xxxx 0byyyyyyyy 0bzzzzzzzz`), do we return true?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633518422



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2317,6 +2365,14 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
+const FunctionDoc ascii_reverse_doc(
+    "Reverse ascii input",
+    ("For each ascii string in `strings`, return a reversed version."), {"strings"});

Review comment:
       For consistency, can we include a message similar to the other kernels about UTF-8 support? e.g.
   
   ```
        "This function assumes the input is fully ASCII.  If it may contain\n"
        "non-ASCII characters, use \"utf8_reverse\" instead."),
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2317,6 +2365,14 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
+const FunctionDoc ascii_reverse_doc(
+    "Reverse ascii input",
+    ("For each ascii string in `strings`, return a reversed version."), {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+    "Reverse utf8 input",
+    ("For each utf8 string in `strings`, return a reversed version."), {"strings"});

Review comment:
       Should we clarify that the reversing is done at the codepoint level, and not the grapheme cluster level? (So people would be aware that this kernel can't handle an é encoded as e + ´, for instance, or a flag emoji.)

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,30 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),

Review comment:
       I know this is encoded into the base transform class, but this is a rather confusing error message for an ASCII kernel.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,30 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),

Review comment:
       `StringTransform` already makes calls to the concrete kernel via `Derived::Foo` calls, perhaps constructing the Status could be made into such a call.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632640163



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));
+      i += offset;

Review comment:
       Ah, good point. So, if there is a malformed continuation byte (`0b10xxxxxx`) outside of a multi-byte encoding, this could happen. I think I should change the LUT to have 1 instead of 0 for those entries, shouldn't I?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633780801



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       Preference about what?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] github-actions[bot] commented on pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

github-actions[bot] commented on pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#issuecomment-840776704


   https://issues.apache.org/jira/browse/ARROW-12713


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632638764



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       I actually couldn't find it there. I can actually add an inline function like this
   ```c++
   inline uint8_t Utf8CharByteWidth(const uint8_t* char_ptr);
   ```
   WDYT?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r636190660



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       Done




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632797486



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       There is `ARROW_FORCE_INLINE` in "util/macros.h"




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r636097341



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       Sorry @nirandaperera, do you want to take Antoine's suggestion and move this to utf8.h as a `static inline` function? Also, please note in the docstring that it treats \xFF as the lead byte of a 4-byte sequence, and that this is intentional because \xFF is an invalid byte in UTF-8 anyway.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

edponce commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632639816



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       Thanks for the clarification.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632123654



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       IMO `ascii_reverse` is slightly different from the other `ascii_*` routines. If non-ascii chars are present, the other routines (e.g. lower, upper, etc.) can simply pass them through unchanged, but reverse can't do that. Therefore, I added a flag `utf8_char_found` inside the loop, and if any utf8 char is found, `ascii_reverse` returns an invalid status. 
   I am not sure whether this check is needed or not. If we can assume that the user passes an 'ascii-only' input, we can simply use `std::reverse_copy`. So, I'd like to get a second opinion on that. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#issuecomment-845536089


   The formatting needs to be fixed here (see the logs for the Dev action), the other CI failures look unrelated.
   
   You could try `@github-actions autotune` to do that for you.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632615943



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       IMO if the check is relatively cheap then it's good to keep. If someone is using the ASCII kernel explicitly, presumably either
   1) they want an assurance their data is ASCII
   2) they know data is ASCII and assume it's faster to reverse knowing this




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] kiszk commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

kiszk commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633107773



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       Just out of curiosity: if all the characters are pure ASCII, do we return true?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm closed pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm closed pull request #10317:
URL: https://github.com/apache/arrow/pull/10317


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] kiszk commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

kiszk commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633107773



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,54 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1
+ * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length
+ * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length
+ * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars
+ * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 1, 1, 1, 1, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];
+      offset_type stride = std::min(i + offset, input_string_ncodeunits);
+      std::copy(input + i, input + stride, output + input_string_ncodeunits - stride);
+      i += offset;
+    }
+    *output_written = input_string_ncodeunits;
+    return true;

Review comment:
       Just out of curiosity:
   - If all the characters are pure ASCII, do we return true?
   - If the input is an invalid UTF-8 sequence (e.g. the bytes `0b1111xxxx 0byyyyyyyy 0bzzzzzzzz`), do we return true?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632635689



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;

Review comment:
       Ah, I see. No, it's ok.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nirandaperera commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

nirandaperera commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632123654



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;

Review comment:
       IMO `ascii_reverse` is slightly different from the other `ascii_*` routines. If non-ASCII characters are present, the other routines (e.g. lower, upper) can bypass them, but reverse can't do that. Therefore, I added a flag `utf8_char_found` inside the loop, and if any UTF-8 character is found, `ascii_reverse` returns an Invalid status.
   I am not sure whether this check is needed. If we can assume that the user passes ASCII-only input, we can simply use `std::reverse_copy`. So I'd like to get a second opinion on that.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] cyb70289 commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

cyb70289 commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632245699



##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,25 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Reverse) {
+  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+  this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),

Review comment:
       Can we write `R"(["aAazZæÆ&", null, "", "bbb"])"` instead?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r634461566



##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,31 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  testing::HasSubstr("Non-ascii sequence in input"),

Review comment:
       ```suggestion
                                     testing::HasSubstr("Non-ASCII sequence in input"),
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633779485



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       If \xFF is invalid, then it's probably ok either way if it skips 4 bytes or one, because the input is invalid and we don't care about the end result so long as we don't crash or invoke undefined behavior.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] lidavidm commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Posted by GitBox <gi...@apache.org>.

lidavidm commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r633518422



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2317,6 +2365,14 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
+const FunctionDoc ascii_reverse_doc(
+    "Reverse ascii input",
+    ("For each ascii string in `strings`, return a reversed version."), {"strings"});

Review comment:
       For consistency, can we include a message similar to the other kernels about UTF-8 support? e.g.
   
   ```
        "This function assumes the input is fully ASCII.  If it may contain\n"
        "non-ASCII characters, use \"utf8_reverse\" instead."),
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2317,6 +2365,14 @@ const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});
 
+const FunctionDoc ascii_reverse_doc(
+    "Reverse ascii input",
+    ("For each ascii string in `strings`, return a reversed version."), {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+    "Reverse utf8 input",
+    ("For each utf8 string in `strings`, return a reversed version."), {"strings"});

Review comment:
       Should we clarify that the reversing is done at the codepoint level, and not the grapheme cluster level? (So people would be aware that this kernel can't handle an é encoded as e + ´, for instance, or a flag emoji.)

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,30 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),

Review comment:
       I know this is encoded into the base transform class, but this is a rather confusing error message for an ASCII kernel.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,30 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"),

Review comment:
       `StringTransform` already makes calls to the concrete kernel via `Derived::Foo` calls; perhaps constructing the Status could be made into such a call as well.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org