You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/06/15 11:02:55 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7434: ARROW-9131: [C++] Faster ascii_lower and ascii_upper.

pitrou commented on a change in pull request #7434:
URL: https://github.com/apache/arrow/pull/7434#discussion_r440095501



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -64,77 +64,30 @@ void StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
   }
 }
 
-// Generated with
-//
-// print("static constexpr uint8_t kAsciiUpperTable[] = {")
-// for i in range(256):
-//     if i > 0: print(', ', end='')
-//     if i >= ord('a') and i <= ord('z'):
-//         print(i - 32, end='')
-//     else:
-//         print(i, end='')
-// print("};")
-
-static constexpr uint8_t kAsciiUpperTable[] = {
-    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
-    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-    48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
-    64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
-    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
-    96,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
-    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  123, 124, 125, 126, 127,
-    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
-    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
-    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
-    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
-
 void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
   for (int64_t i = 0; i < length; ++i) {
-    *output++ = kAsciiUpperTable[*input++];
+    const uint8_t utf8_code_unit = *input++;
+    // Code units in the range [a-z] can only be an encoding of an ascii
+    // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different
+    // codepoint. This guaranteed by non-overal design of the unicode standard. (see

Review comment:
       Typo: "non-overal" should be "non-overlap".




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org