You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2019/08/01 20:50:53 UTC
[incubator-iceberg] branch master updated: Fix truncateStringMax in UnicodeUtil (#334)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 30d45f8 Fix truncateStringMax in UnicodeUtil (#334)
30d45f8 is described below
commit 30d45f88d1f10afdef0884b89860e6fd1f25365d
Author: Vinitha Gankidi <vg...@netflix.com>
AuthorDate: Tue Jul 30 19:18:59 2019 -0700
Fix truncateStringMax in UnicodeUtil (#334)
Fixes #328, fixes #329.
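The index passed to codePointAt must be the char offset obtained from offsetByCodePoints, not the code point count itself; the two differ once the string contains surrogate pairs.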
---
api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java | 6 +++---
.../test/java/org/apache/iceberg/TestMetricsTruncation.java | 11 +++++++++++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
index 1eaed21..f76ec73 100644
--- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
+++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
@@ -79,11 +79,11 @@ public class UnicodeUtil {
// Try incrementing the code points from the end
for (int i = length - 1; i >= 0; i--) {
- int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1;
+ // Get the offset in the truncated string buffer where the number of unicode characters = i
+ int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
+ int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1;
// No overflow
if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
- // Get the offset in the truncated string buffer where the number of unicode characters = i
- int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
truncatedStringBuffer.setLength(offsetByCodePoint);
// Append next code point to the truncated substring
truncatedStringBuffer.appendCodePoint(nextCodePoint);
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
index 7a99904..af304da 100644
--- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
+++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
@@ -139,6 +139,9 @@ public class TestMetricsTruncation {
String test6 = "\uD800\uDFFF\uD800\uDFFF";
// Increment the previous character
String test6_2_expected = "\uD801\uDC00";
+ String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
+ String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03";
+ String test7_1_expected = "\uD83D\uDE03";
Comparator<CharSequence> cmp = Literal.of(test1).comparator();
Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
@@ -176,5 +179,13 @@ public class TestMetricsTruncation {
Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " +
"the first character incremented", cmp.compare(
truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0);
+ Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
+ cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0);
+ Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " +
+ "character should be incremented", cmp.compare(
+ truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0);
+ Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " +
+ "character should be incremented", cmp.compare(
+ truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0);
}
}