You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@camel.apache.org by da...@apache.org on 2020/02/11 09:11:43 UTC

[camel] branch master updated: Bindy unicode patch (#3552)

This is an automated email from the ASF dual-hosted git repository.

davsclaus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/camel.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e36d91  Bindy unicode patch (#3552)
0e36d91 is described below

commit 0e36d91863957ff0e5d2fc7c92726c0243bfddd9
Author: mgr-lhm <ex...@muenchen.de>
AuthorDate: Tue Feb 11 10:11:28 2020 +0100

    Bindy unicode patch (#3552)
    
    CAMEL-14521: Added Unicode support to bindy fixed length format via icu4j.
---
 components/camel-bindy/pom.xml                     |   8 +-
 .../dataformat/bindy/BindyFixedLengthFactory.java  |  17 +-
 .../camel/dataformat/bindy/UnicodeHelper.java      | 196 +++++++++++++++++
 .../bindy/annotation/FixedLengthRecord.java        |   5 +
 .../camel/dataformat/bindy/UnicodeHelperTest.java  | 232 +++++++++++++++++++++
 parent/pom.xml                                     |   1 +
 6 files changed, 454 insertions(+), 5 deletions(-)

diff --git a/components/camel-bindy/pom.xml b/components/camel-bindy/pom.xml
index 33aa833..e0f84ba 100644
--- a/components/camel-bindy/pom.xml
+++ b/components/camel-bindy/pom.xml
@@ -31,15 +31,17 @@
     <name>Camel :: Bindy</name>
     <description>Camel Bindy data format support</description>
 
-    <properties>
-    </properties>
-
     <dependencies>
 
         <dependency>
             <groupId>org.apache.camel</groupId>
             <artifactId>camel-support</artifactId>
         </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>${icu4j-version}</version>
+        </dependency>
 
         <!-- testing -->
         <dependency>
diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java
index f14f4d5..f5100ff 100644
--- a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java
+++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java
@@ -69,6 +69,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
     private int recordLength;
     private boolean ignoreTrailingChars;
     private boolean ignoreMissingChars;
+    private boolean countGrapheme;
 
     private Class<?> header;
     private Class<?> footer;
@@ -161,7 +162,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
         // noop
     }
 
-    public void bind(CamelContext camelContext, String record, Map<String, Object> model, int line) throws Exception {
+    public void bind(CamelContext camelContext, String recordStr, Map<String, Object> model, int line) throws Exception {
 
         int pos = 1;
         int counterMandatoryFields = 0;
@@ -171,6 +172,8 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
         int length;
         String delimiter;
         Field field;
+        
+        final UnicodeHelper record = new UnicodeHelper(recordStr, (this.countGrapheme) ? UnicodeHelper.Method.GRAPHEME : UnicodeHelper.Method.CODEPOINTS);
 
         // Iterate through the list of positions
         // defined in the @DataField
@@ -217,7 +220,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
                 }
                 offset += length;
             } else if (!delimiter.equals("")) {
-                String tempToken = record.substring(offset - 1, record.length());
+                final UnicodeHelper tempToken = new UnicodeHelper(record.substring(offset - 1, record.length()), (this.countGrapheme) ? UnicodeHelper.Method.GRAPHEME : UnicodeHelper.Method.CODEPOINTS);
                 token = tempToken.substring(0, tempToken.indexOf(delimiter));
                 // include the delimiter in the offset calculation
                 offset += token.length() + 1;
@@ -604,6 +607,9 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
 
                 ignoreMissingChars = record.ignoreMissingChars();
                 LOG.debug("Enable ignore missing chars: {}", ignoreMissingChars);
+                
+                countGrapheme = record.countGrapheme();
+                LOG.debug("Enable grapheme counting instead of codepoints: {}", countGrapheme);               
             }
         }
 
@@ -712,4 +718,11 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin
         return ignoreMissingChars;
     }
 
+    /**
+     * Flag indicating whether graphemes or codepoints are counted.
+     *
+     * @return {@code true} if the record is processed with {@link UnicodeHelper.Method#GRAPHEME},
+     *         {@code false} if it is processed with {@link UnicodeHelper.Method#CODEPOINTS}
+     */
+    public boolean isCountGrapheme() {
+        return countGrapheme;
+    }
+
 }
diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java
new file mode 100644
index 0000000..f55e4e2
--- /dev/null
+++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.dataformat.bindy;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.ibm.icu.text.BreakIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class replicates the essential parts of the String class in order to aid
+ * proper work for Unicode chars in the presense of UTF-16. So for all operations 
+ * please see {@link String} with the same signature. This class is equally immutable.
+ */
+public class UnicodeHelper implements Serializable {
+    /**
+     * Defines how length if a string is defined, i.e how chars are counted.
+     */
+    public enum Method {
+        /**
+         * One "char" is one Unicode codepoint, which is the standard case.
+         */
+        CODEPOINTS,
+        
+        /**
+         * One "char" is one graphem.
+         */
+        GRAPHEME;
+    }
+    
+    private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelper.class);
+    
+    private String input;
+    
+    private List<Integer> splitted;
+
+    private Method method;
+    
+    /**
+     * Create instance.
+     * 
+     * @param input
+     *         String, that is to be wrapped.
+     * @param method 
+     *         Method, that is used to determin "chars" of string.
+     */
+    public UnicodeHelper(final String input, final Method method) {
+        this.input = input;
+        this.method = method;
+        this.splitted = null;
+    }
+
+    /**
+     * For Serialization only!
+     */
+    protected UnicodeHelper() {
+        // Empty
+    }
+
+    /**
+     * @return
+     *         Returns the method used to determining the string length.
+     */
+    public Method getMethod() {
+        return method;
+    }
+
+    /**
+     * @see String#substring(int)
+     */
+    public String substring(final int beginIndex) {
+        split();
+        
+        final int beginChar = splitted.get(beginIndex);
+        return input.substring(beginChar);
+    }
+    
+    /**
+     * @see String#substring(int, int)
+     */
+    public String substring(final int beginIndex, final int endIndex) {
+        split();
+        
+        final int beginChar = splitted.get(beginIndex);
+        final int endChar = splitted.get(endIndex);
+        return input.substring(beginChar, endChar);
+    }
+
+    /**
+     * @see String#length()
+     */
+    public int length() {
+        split();
+        
+        return splitted.size() - 1;
+    }
+    
+    /**
+     * @see String#indexOf(String)
+     */
+    public int indexOf(final String str) {
+        split();
+        
+        final int tempIdx = input.indexOf(str);
+        if (tempIdx < 0) {
+            return tempIdx;
+        }
+        
+        for (int b = 0; b < splitted.size() - 1; b++) {
+            if (tempIdx == splitted.get(b)) {
+                for (int e = b + 1; e < splitted.size() - 1; e++) {
+                    if (tempIdx + str.length() == splitted.get(e)) {
+                        return b;
+                    }
+                }
+            }
+        }
+        
+        final String cps = str.codePoints().mapToObj(cp -> String.format("0x%X", cp)).collect(Collectors.joining(","));
+        throw new IllegalArgumentException("Given string (" + cps + ") is not a valid sequence of " + this.method + "s.");
+    }
+    
+    private void split() {
+        if (this.splitted != null) {
+            return;
+        }
+        
+        if (method.equals(Method.CODEPOINTS)) {
+            splitCodepoints();
+            
+        } else /* (method.equals(Method.GRAPHEME)) */ {
+            splitGrapheme();
+        }
+        
+        LOG.debug("\"{}\" is splitted into {} ({} {}).", input, splitted, splitted.size() - 1, method);
+        if (LOG.isTraceEnabled()) {
+            for (int i = 0; i < splitted.size() - 2; i++) {
+                LOG.trace("segment [{},{}[=\"{}\".", splitted.get(i), splitted.get(i + 1), input.substring(splitted.get(i), splitted.get(i + 1)));
+            }
+        }
+    }
+
+    private void splitCodepoints() {
+        final List<Integer> result = new ArrayList<>();
+        
+        int i = 0;
+        final int len = input.length();
+        while (i < len) {
+            result.add(i);
+            i += (Character.codePointAt(input, i) > 0xffff) ? 2 : 1; 
+        }
+        result.add(len);
+        
+        this.splitted = result;
+    }
+
+    private void splitGrapheme() {
+        final List<Integer> result = new ArrayList<>();
+
+        // 
+        // Caution: The BreakIterator of ICU lib (com.ibm.icu.text.BreakIterator; siehe Dependencies) ist used here, 
+        //          since the Java builtin one cannot handle modern unicode (Emojis with sex, skin colour, etc.) correctly.
+        //
+        final BreakIterator bit = BreakIterator.getCharacterInstance();
+        bit.setText(input);
+        
+        result.add(bit.first());
+        for (int end = bit.next(); end != BreakIterator.DONE; end = bit.next()) {
+            result.add(end);
+        }
+        this.splitted = result;
+    }
+
+    @Override
+    public String toString() {
+        return "StringHelper [input=" + input + ", splitted=" + splitted + ", method=" + method + "]";
+    }
+}
diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java
index cff27fc..d8d93ae 100644
--- a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java
+++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java
@@ -98,4 +98,9 @@ public @interface FixedLengthRecord {
      * Indicates whether too short lines will be ignored
      */
     boolean ignoreMissingChars() default false;
+    
+    /**
+     * Indicates how chars are counted: if {@code true} (the default), one char is one grapheme;
+     * if {@code false}, one char is one Unicode codepoint.
+     */
+    boolean countGrapheme() default true;
 }
diff --git a/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java b/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java
new file mode 100644
index 0000000..fad38e9
--- /dev/null
+++ b/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.dataformat.bindy;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import org.apache.camel.dataformat.bindy.UnicodeHelper.Method;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Unit tests for {@link UnicodeHelper} covering length, substring and indexOf
+ * for both counting methods (codepoints and graphemes).
+ */
+@SuppressWarnings("javadoc")
+public class UnicodeHelperTest {
+
+    private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelperTest.class);
+
+    /**
+     * One single emoji grapheme that is composed of five codepoints (seven UTF-16 code units).
+     */
+    private static final String UCSTR = cps2String(
+        0x1f645, // FACE WITH NO GOOD GESTURE; base character (gesture)
+        0x1f3ff, // EMOJI MODIFIER FITZPATRICK TYPE-6; skin colour for #1
+        0x200d,  // ZERO WIDTH JOINER [ZWJ]; control character used for joining
+        0x2642,  // MALE SIGN; sex for #1
+        0xfe0f   // VARIATION SELECTOR-16 [VS16]; pictographic presentation for #4
+    );
+
+    @Test
+    public void testLengthCPs() {
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+        Assert.assertEquals(1, lh.length());
+
+        final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS);
+        Assert.assertEquals(1, lh2.length());
+
+        final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS);
+        Assert.assertEquals(5, lh3.length());
+
+        final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS);
+        Assert.assertEquals(7, lh4.length());
+
+        final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS);
+        Assert.assertEquals(3, lh5.length());
+    }
+
+    @Test
+    public void testLengthGrapheme() {
+
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+        Assert.assertEquals(1, lh.length());
+
+        final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME);
+        Assert.assertEquals(1, lh2.length());
+
+        final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME);
+        Assert.assertEquals(1, lh3.length());
+
+        final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME);
+        Assert.assertEquals(3, lh4.length());
+
+        final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME);
+        Assert.assertEquals(2, lh5.length());
+    }
+
+    @Test
+    public void testSubstringCPs() throws FileNotFoundException, IOException {
+
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+        Assert.assertEquals("a", lh.substring(0));
+
+        final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS);
+        Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0));
+
+        final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS);
+        Assert.assertEquals(UCSTR, lh3.substring(0));
+
+        final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS);
+        Assert.assertEquals(UCSTR + "A", lh4.substring(1));
+        Assert.assertEquals(new String(Character.toChars(0x1f3ff)) + "\u200d\u2642\ufe0fA", lh4.substring(2));
+
+        final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS);
+        Assert.assertEquals("\u035fh", lh5.substring(1));
+    }
+
+    @Test
+    public void testSubstringGrapheme() throws FileNotFoundException, IOException {
+
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+        Assert.assertEquals("a", lh.substring(0));
+
+        final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME);
+        Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0));
+
+        final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME);
+        Assert.assertEquals(UCSTR, lh3.substring(0));
+
+        final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME);
+        Assert.assertEquals(UCSTR + "A", lh4.substring(1));
+        Assert.assertEquals("A", lh4.substring(2));
+
+        final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME);
+        Assert.assertEquals("h", lh5.substring(1));
+    }
+
+    @Test
+    public void testSubstringCPs2() {
+
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+        Assert.assertEquals("a", lh.substring(0, 1));
+
+        final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS);
+        Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0, 1));
+
+        final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS);
+        Assert.assertEquals(new String(Character.toChars(0x1f645)), lh3.substring(0, 1));
+
+        final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS);
+        Assert.assertEquals("a", lh4.substring(0, 1));
+        Assert.assertEquals(new String(Character.toChars(0x1f645)), lh4.substring(1, 2));
+        Assert.assertEquals(new String(Character.toChars(0x1f3ff)), lh4.substring(2, 3));
+        Assert.assertEquals("a" + new String(Character.toChars(0x1f645)), lh4.substring(0, 2));
+
+        final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS);
+        Assert.assertEquals("k", lh5.substring(0, 1));
+        Assert.assertEquals("\u035f", lh5.substring(1, 2));
+    }
+
+    @Test
+    public void testSubstringGrapheme2() {
+
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+        Assert.assertEquals("a", lh.substring(0, 1));
+
+        final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME);
+        Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0, 1));
+
+        final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME);
+        Assert.assertEquals(UCSTR, lh3.substring(0, 1));
+
+        final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME);
+        Assert.assertEquals("a", lh4.substring(0, 1));
+        Assert.assertEquals(UCSTR, lh4.substring(1, 2));
+        Assert.assertEquals("A", lh4.substring(2, 3));
+        Assert.assertEquals("a" + UCSTR, lh4.substring(0, 2));
+
+        final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME);
+        Assert.assertEquals("k\u035f", lh5.substring(0, 1));
+        Assert.assertEquals("h", lh5.substring(1, 2));
+    }
+
+    @Test
+    public void testIndexOf() {
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS);
+        Assert.assertEquals(-1, lh.indexOf("b"));
+
+        final UnicodeHelper lh2 = new UnicodeHelper(
+            "a" + new String(Character.toChars(0x1f600)) + "a" + UCSTR + "A" + "k\u035fh" + "z",
+            Method.CODEPOINTS);
+
+        Assert.assertEquals(1, lh2.indexOf(new String(Character.toChars(0x1f600))));
+
+        Assert.assertEquals(3, lh2.indexOf(UCSTR));
+
+        Assert.assertEquals(10, lh2.indexOf("\u035f"));
+
+        expectIllegalArgumentException(() -> {
+            lh2.indexOf(Character.toString(Character.toChars(0x1f600)[0])); // UTF-16  surrogates are no codepoints.
+        });
+    }
+
+    @Test
+    public void testIndexOf2() {
+        final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME);
+        Assert.assertEquals(-1, lh.indexOf("b"));
+
+        final UnicodeHelper lh2 = new UnicodeHelper(
+            "a" + new String(Character.toChars(0x1f600)) + "a" + UCSTR + "A" + "k\u035fh" + "z",
+            Method.GRAPHEME);
+
+        Assert.assertEquals(1, lh2.indexOf(new String(Character.toChars(0x1f600))));
+
+        Assert.assertEquals(3, lh2.indexOf(UCSTR));
+
+        expectIllegalArgumentException(() -> {
+            lh2.indexOf("\u035f"); // Codepoint of dangling combining char is not a "unicode char".
+        });
+    }
+
+    /**
+     * Runs the given action and fails the test unless it throws an {@link IllegalArgumentException}.
+     */
+    private void expectIllegalArgumentException(final Runnable r) {
+        try {
+            r.run();
+        } catch (final IllegalArgumentException e) {
+            LOG.debug("Caught expected IllegalArgumentException", e);
+            return;
+        }
+        Assert.fail("We do not expect to reach here -- missing IllegalArgumentException.");
+    }
+
+    /**
+     * Builds a string from the given Unicode codepoints.
+     */
+    private static String cps2String(final int... cps) {
+        final StringBuilder buf = new StringBuilder();
+        for (int cp : cps) {
+            buf.append(Character.toChars(cp));
+        }
+        final String result = buf.toString();
+
+        if (LOG.isDebugEnabled()) {
+            final String cpStr = Arrays.stream(cps).boxed()
+                .map(i -> "0x" + Integer.toString(i, 16))
+                .collect(Collectors.joining(", "));
+            LOG.debug("Built string '{}' from CPs [ {} ].", result, cpStr);
+        }
+
+        return result;
+    }
+}
diff --git a/parent/pom.xml b/parent/pom.xml
index a94002e..7e43826 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -282,6 +282,7 @@
         <hystrix-bundle-version>1.5.18_1</hystrix-bundle-version>
         <ibatis-bundle-version>2.3.4.726_4</ibatis-bundle-version>
         <ical4j-version>1.0.7</ical4j-version>
+        <icu4j-version>65.1</icu4j-version>
         <ignite-version>2.7.6</ignite-version>
         <infinispan-version>10.1.1.Final</infinispan-version>
         <influx-java-driver-version>2.17</influx-java-driver-version>