You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@camel.apache.org by da...@apache.org on 2020/02/11 09:11:43 UTC
[camel] branch master updated: Bindy unicode patch (#3552)
This is an automated email from the ASF dual-hosted git repository.
davsclaus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/camel.git
The following commit(s) were added to refs/heads/master by this push:
new 0e36d91 Bindy unicode patch (#3552)
0e36d91 is described below
commit 0e36d91863957ff0e5d2fc7c92726c0243bfddd9
Author: mgr-lhm <ex...@muenchen.de>
AuthorDate: Tue Feb 11 10:11:28 2020 +0100
Bindy unicode patch (#3552)
CAMEL-14521: Added Unicode support to bindy fixed length format via icu4j.
---
components/camel-bindy/pom.xml | 8 +-
.../dataformat/bindy/BindyFixedLengthFactory.java | 17 +-
.../camel/dataformat/bindy/UnicodeHelper.java | 196 +++++++++++++++++
.../bindy/annotation/FixedLengthRecord.java | 5 +
.../camel/dataformat/bindy/UnicodeHelperTest.java | 232 +++++++++++++++++++++
parent/pom.xml | 1 +
6 files changed, 454 insertions(+), 5 deletions(-)
diff --git a/components/camel-bindy/pom.xml b/components/camel-bindy/pom.xml
index 33aa833..e0f84ba 100644
--- a/components/camel-bindy/pom.xml
+++ b/components/camel-bindy/pom.xml
@@ -31,15 +31,17 @@
<name>Camel :: Bindy</name>
<description>Camel Bindy data format support</description>
- <properties>
- </properties>
-
<dependencies>
<dependency>
<groupId>org.apache.camel</groupId>
<artifactId>camel-support</artifactId>
</dependency>
+ <dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ <version>${icu4j-version}</version>
+ </dependency>
<!-- testing -->
<dependency>
diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java
index f14f4d5..f5100ff 100644
--- a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java
+++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java
@@ -69,6 +69,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
private int recordLength;
private boolean ignoreTrailingChars;
private boolean ignoreMissingChars;
+ private boolean countGrapheme;
private Class<?> header;
private Class<?> footer;
@@ -161,7 +162,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
// noop
}
- public void bind(CamelContext camelContext, String record, Map<String, Object> model, int line) throws Exception {
+ public void bind(CamelContext camelContext, String recordStr, Map<String, Object> model, int line) throws Exception {
int pos = 1;
int counterMandatoryFields = 0;
@@ -171,6 +172,8 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
int length;
String delimiter;
Field field;
+
+ final UnicodeHelper record = new UnicodeHelper(recordStr, (this.countGrapheme) ? UnicodeHelper.Method.GRAPHEME : UnicodeHelper.Method.CODEPOINTS);
// Iterate through the list of positions
// defined in the @DataField
@@ -217,7 +220,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
}
offset += length;
} else if (!delimiter.equals("")) {
- String tempToken = record.substring(offset - 1, record.length());
+ final UnicodeHelper tempToken = new UnicodeHelper(record.substring(offset - 1, record.length()), (this.countGrapheme) ? UnicodeHelper.Method.GRAPHEME : UnicodeHelper.Method.CODEPOINTS);
token = tempToken.substring(0, tempToken.indexOf(delimiter));
// include the delimiter in the offset calculation
offset += token.length() + 1;
@@ -604,6 +607,9 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
ignoreMissingChars = record.ignoreMissingChars();
LOG.debug("Enable ignore missing chars: {}", ignoreMissingChars);
+
+ countGrapheme = record.countGrapheme();
+ LOG.debug("Enable grapheme counting instead of codepoints: {}", countGrapheme);
}
}
@@ -712,4 +718,11 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
return ignoreMissingChars;
}
+ /**
+ * Flag indicating whether graphemes or codepoints are counted.
+ */
+ public boolean isCountGrapheme() {
+ return countGrapheme;
+ }
+
}
diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java
new file mode 100644
index 0000000..f55e4e2
--- /dev/null
+++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.dataformat.bindy;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.ibm.icu.text.BreakIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class replicates the essential parts of the String class in order to aid
+ * proper work for Unicode chars in the presence of UTF-16. So for all operations
+ * please see {@link String} with the same signature. This class is equally immutable.
+ */
+public class UnicodeHelper implements Serializable {
+ /**
+ * Defines how the length of a string is determined, i.e. how chars are counted.
+ */
+ public enum Method {
+ /**
+ * One "char" is one Unicode codepoint, which is the standard case.
+ */
+ CODEPOINTS,
+
+ /**
+ * One "char" is one grapheme.
+ */
+ GRAPHEME;
+ }
+
+ private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelper.class);
+
+ private String input;
+
+ private List<Integer> splitted;
+
+ private Method method;
+
+ /**
+ * Create instance.
+ *
+ * @param input
+ * String, that is to be wrapped.
+ * @param method
+ * Method that is used to determine the "chars" of the string.
+ */
+ public UnicodeHelper(final String input, final Method method) {
+ this.input = input;
+ this.method = method;
+ this.splitted = null;
+ }
+
+ /**
+ * For Serialization only!
+ */
+ protected UnicodeHelper() {
+ // Empty
+ }
+
+ /**
+ * @return
+ * Returns the method used to determine the string length.
+ */
+ public Method getMethod() {
+ return method;
+ }
+
+ /**
+ * @see String#substring(int)
+ */
+ public String substring(final int beginIndex) {
+ split();
+
+ final int beginChar = splitted.get(beginIndex);
+ return input.substring(beginChar);
+ }
+
+ /**
+ * @see String#substring(int, int)
+ */
+ public String substring(final int beginIndex, final int endIndex) {
+ split();
+
+ final int beginChar = splitted.get(beginIndex);
+ final int endChar = splitted.get(endIndex);
+ return input.substring(beginChar, endChar);
+ }
+
+ /**
+ * @see String#length()
+ */
+ public int length() {
+ split();
+
+ return splitted.size() - 1;
+ }
+
+ /**
+ * @see String#indexOf(String)
+ */
+ public int indexOf(final String str) {
+ split();
+
+ final int tempIdx = input.indexOf(str);
+ if (tempIdx < 0) {
+ return tempIdx;
+ }
+
+ for (int b = 0; b < splitted.size() - 1; b++) {
+ if (tempIdx == splitted.get(b)) {
+ for (int e = b + 1; e < splitted.size() - 1; e++) {
+ if (tempIdx + str.length() == splitted.get(e)) {
+ return b;
+ }
+ }
+ }
+ }
+
+ final String cps = str.codePoints().mapToObj(cp -> String.format("0x%X", cp)).collect(Collectors.joining(","));
+ throw new IllegalArgumentException("Given string (" + cps + ") is not a valid sequence of " + this.method + "s.");
+ }
+
+ private void split() {
+ if (this.splitted != null) {
+ return;
+ }
+
+ if (method.equals(Method.CODEPOINTS)) {
+ splitCodepoints();
+
+ } else /* (method.equals(Method.GRAPHEME)) */ {
+ splitGrapheme();
+ }
+
+ LOG.debug("\"{}\" is splitted into {} ({} {}).", input, splitted, splitted.size() - 1, method);
+ if (LOG.isTraceEnabled()) {
+ for (int i = 0; i < splitted.size() - 2; i++) {
+ LOG.trace("segment [{},{}[=\"{}\".", splitted.get(i), splitted.get(i + 1), input.substring(splitted.get(i), splitted.get(i + 1)));
+ }
+ }
+ }
+
+ private void splitCodepoints() {
+ final List<Integer> result = new ArrayList<>();
+
+ int i = 0;
+ final int len = input.length();
+ while (i < len) {
+ result.add(i);
+ i += (Character.codePointAt(input, i) > 0xffff) ? 2 : 1;
+ }
+ result.add(len);
+
+ this.splitted = result;
+ }
+
+ private void splitGrapheme() {
+ final List<Integer> result = new ArrayList<>();
+
+ //
+ // Caution: The BreakIterator of the ICU lib (com.ibm.icu.text.BreakIterator; see dependencies) is used here,
+ // since the Java builtin one cannot handle modern unicode (Emojis with sex, skin colour, etc.) correctly.
+ //
+ final BreakIterator bit = BreakIterator.getCharacterInstance();
+ bit.setText(input);
+
+ result.add(bit.first());
+ for (int end = bit.next(); end != BreakIterator.DONE; end = bit.next()) {
+ result.add(end);
+ }
+ this.splitted = result;
+ }
+
+ @Override
+ public String toString() {
+ return "StringHelper [input=" + input + ", splitted=" + splitted + ", method=" + method + "]";
+ }
+}
diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java
index cff27fc..d8d93ae 100644
--- a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java
+++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java
@@ -98,4 +98,9 @@ public @interface FixedLengthRecord {
* Indicates whether too short lines will be ignored
*/
boolean ignoreMissingChars() default false;
+
+ /**
+ * Indicates whether chars are counted as graphemes (true) or as Unicode codepoints (false)
+ */
+ boolean countGrapheme() default true;
}
diff --git a/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java b/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java
new file mode 100644
index 0000000..fad38e9
--- /dev/null
+++ b/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.dataformat.bindy;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import org.apache.camel.dataformat.bindy.UnicodeHelper.Method;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+@SuppressWarnings("javadoc")
+public class UnicodeHelperTest {
+
+ private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelperTest.class);
+
+ private static final String UCSTR = cps2String(
+ 0x1f645, // FACE WITH NO GOOD GESTURE; base character (gesture)
+ 0x1f3ff, // EMOJI MODIFIER FITZPATRICK TYPE-6; skin tone for #1
+ 0x200d, // ZERO WIDTH JOINER [ZWJ]; control character used for joining
+ 0x2642, // MALE SIGN; gender for #1
+ 0xfe0f // VARIATION SELECTOR-16 [VS16]; pictographic (emoji) presentation for #4
+ );
+
+ @Test
+ public void testLengthCPs() {
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+ Assert.assertEquals(1, lh.length());
+
+ final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS);
+ Assert.assertEquals(1, lh2.length());
+
+ final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS);
+ Assert.assertEquals(5, lh3.length());
+
+ final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS);
+ Assert.assertEquals(7, lh4.length());
+
+ final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS);
+ Assert.assertEquals(3, lh5.length());
+ }
+
+ @Test
+ public void testLengthGrapheme() {
+
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+ Assert.assertEquals(1, lh.length());
+
+ final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME);
+ Assert.assertEquals(1, lh2.length());
+
+ final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME);
+ Assert.assertEquals(1, lh3.length());
+
+ final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME);
+ Assert.assertEquals(3, lh4.length());
+
+ final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME);
+ Assert.assertEquals(2, lh5.length());
+ }
+
+ @Test
+ public void testSubstringCPs() throws FileNotFoundException, IOException {
+
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+ Assert.assertEquals("a", lh.substring(0));
+
+ final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS);
+ Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0));
+
+ final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS);
+ Assert.assertEquals(UCSTR, lh3.substring(0));
+
+ final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS);
+ Assert.assertEquals(UCSTR + "A", lh4.substring(1));
+ Assert.assertEquals(new String(Character.toChars(0x1f3ff)) + "\u200d\u2642\ufe0fA", lh4.substring(2));
+
+ final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS);
+ Assert.assertEquals("\u035fh", lh5.substring(1));
+ }
+
+ @Test
+ public void testSubstringGrapheme() throws FileNotFoundException, IOException {
+
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+ Assert.assertEquals("a", lh.substring(0));
+
+ final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME);
+ Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0));
+
+ final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME);
+ Assert.assertEquals(UCSTR, lh3.substring(0));
+
+ final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME);
+ Assert.assertEquals(UCSTR + "A", lh4.substring(1));
+ Assert.assertEquals("A", lh4.substring(2));
+
+ final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME);
+ Assert.assertEquals("h", lh5.substring(1));
+ }
+
+ @Test
+ public void testSubstringCPs2() {
+
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+ Assert.assertEquals("a", lh.substring(0, 1));
+
+ final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS);
+ Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0, 1));
+
+ final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS);
+ Assert.assertEquals(new String(Character.toChars(0x1f645)), lh3.substring(0, 1));
+
+ final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS);
+ Assert.assertEquals("a", lh4.substring(0, 1));
+ Assert.assertEquals(new String(Character.toChars(0x1f645)), lh4.substring(1, 2));
+ Assert.assertEquals(new String(Character.toChars(0x1f3ff)), lh4.substring(2, 3));
+ Assert.assertEquals("a" + new String(Character.toChars(0x1f645)), lh4.substring(0, 2));
+
+ final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS);
+ Assert.assertEquals("k", lh5.substring(0, 1));
+ Assert.assertEquals("\u035f", lh5.substring(1, 2));
+ }
+
+ @Test
+ public void testSubstringGrapheme2() {
+
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+ Assert.assertEquals("a", lh.substring(0, 1));
+
+ final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME);
+ Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0, 1));
+
+ final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME);
+ Assert.assertEquals(UCSTR, lh3.substring(0, 1));
+
+ final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME);
+ Assert.assertEquals("a", lh4.substring(0, 1));
+ Assert.assertEquals(UCSTR, lh4.substring(1, 2));
+ Assert.assertEquals("A", lh4.substring(2, 3));
+ Assert.assertEquals("a" + UCSTR, lh4.substring(0, 2));
+
+ final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME);
+ Assert.assertEquals("k\u035f", lh5.substring(0, 1));
+ Assert.assertEquals("h", lh5.substring(1, 2));
+ }
+
+ @Test
+ public void testIndexOf() {
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+ Assert.assertEquals(-1, lh.indexOf("b"));
+
+ final UnicodeHelper lh2 = new UnicodeHelper(
+ "a" + new String(Character.toChars(0x1f600)) + "a" + UCSTR + "A" + "k\u035fh" + "z",
+ Method.CODEPOINTS);
+
+ Assert.assertEquals(1, lh2.indexOf(new String(Character.toChars(0x1f600))));
+
+ Assert.assertEquals(3, lh2.indexOf(UCSTR));
+
+ Assert.assertEquals(10, lh2.indexOf("\u035f"));
+
+ expectIllegalArgumentException(() -> {
+ lh2.indexOf(Character.toString(Character.toChars(0x1f600)[0])); // UTF-16 surrogates are no codepoints.
+ });
+ }
+
+ @Test
+ public void testIndexOf2() {
+ final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+ Assert.assertEquals(-1, lh.indexOf("b"));
+
+ final UnicodeHelper lh2 = new UnicodeHelper(
+ "a" + new String(Character.toChars(0x1f600)) + "a" + UCSTR + "A" + "k\u035fh" + "z",
+ Method.GRAPHEME);
+
+ Assert.assertEquals(1, lh2.indexOf(new String(Character.toChars(0x1f600))));
+
+ Assert.assertEquals(3, lh2.indexOf(UCSTR));
+
+ expectIllegalArgumentException(() -> {
+ lh2.indexOf("\u035f"); // Codepoint of dangling combining char is not a "unicode char".
+ });
+ }
+
+ private void expectIllegalArgumentException(final Runnable r) {
+ try {
+ r.run();
+ Assert.assertTrue("We do not expect to reach here -- missing IllegalArgumentException.", false);
+
+ } catch (final IllegalArgumentException e) {
+ LOG.debug("Caught expected IllegalArgumentException", e);
+
+ }
+ }
+
+ private static String cps2String(final int... cps) {
+ final StringBuilder buf = new StringBuilder();
+ for (int cp : cps) {
+ buf.append(Character.toChars(cp));
+ }
+ final String result = buf.toString();
+
+ if (LOG.isDebugEnabled()) {
+ final String cpStr = Arrays.stream(cps).boxed()
+ .map(i -> "0x" + Integer.toString(i, 16))
+ .collect(Collectors.joining(", "));
+ LOG.debug("Built string '{}' from CPs [ {} ].", result, cpStr);
+ }
+
+ return result;
+ }
+}
diff --git a/parent/pom.xml b/parent/pom.xml
index a94002e..7e43826 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -282,6 +282,7 @@
<hystrix-bundle-version>1.5.18_1</hystrix-bundle-version>
<ibatis-bundle-version>2.3.4.726_4</ibatis-bundle-version>
<ical4j-version>1.0.7</ical4j-version>
+ <icu4j-version>65.1</icu4j-version>
<ignite-version>2.7.6</ignite-version>
<infinispan-version>10.1.1.Final</infinispan-version>
<influx-java-driver-version>2.17</influx-java-driver-version>