You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by re...@apache.org on 2018/02/07 15:26:04 UTC

svn commit: r1823479 - in /jackrabbit/oak/trunk/oak-segment-tar/src: main/java/org/apache/jackrabbit/oak/segment/util/ test/java/org/apache/jackrabbit/oak/segment/util/

Author: reschke
Date: Wed Feb  7 15:26:04 2018
New Revision: 1823479

URL: http://svn.apache.org/viewvc?rev=1823479&view=rev
Log:
OAK-7249: Create charset encoding utility that detects malformed input

Added:
    jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java   (with props)
    jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/
    jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java   (with props)

Added: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java?rev=1823479&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java (added)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java Wed Feb  7 15:26:04 2018
@@ -0,0 +1,68 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.jackrabbit.oak.segment.util;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Utility class related to encoding characters into (UTF-8) byte sequences.
+ */
+public class CharsetEncodingUtils {
+
+    private CharsetEncodingUtils() {
+    }
+
+    // One strict encoder per thread: CharsetEncoder is not safe for concurrent use.
+    private static final ThreadLocal<CharsetEncoder> CSE = new ThreadLocal<CharsetEncoder>() {
+        @Override
+        protected CharsetEncoder initialValue() {
+            CharsetEncoder e = StandardCharsets.UTF_8.newEncoder();
+            e.onUnmappableCharacter(CodingErrorAction.REPORT);
+            e.onMalformedInput(CodingErrorAction.REPORT);
+            return e;
+        }
+    };
+
+    // Copies the remaining content of the buffer into a right-sized array.
+    private static byte[] bytes(ByteBuffer b) {
+        byte[] a = new byte[b.remaining()];
+        b.get(a);
+        return a;
+    }
+
+    /**
+     * Like {@link String#getBytes(java.nio.charset.Charset)} (with "UTF-8"),
+     * except that encoding problems (like unpaired surrogates) are reported as
+     * exceptions (see {@link CodingErrorAction#REPORT}), instead of being
+     * silently replaced as it would happen otherwise.
+     *
+     * @param input String to encode; must not be {@code null}
+     * @return String encoded using {@link StandardCharsets#UTF_8}
+     * @throws IOException on encoding error
+     */
+    public static byte[] encodeAsUTF8(String input) throws IOException {
+        CharsetEncoder e = CSE.get().reset();
+        // wrap the String directly; toCharArray() would copy all chars first
+        return bytes(e.encode(CharBuffer.wrap(input)));
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java
------------------------------------------------------------------------------
    svn:executable = *

Added: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java?rev=1823479&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java (added)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java Wed Feb  7 15:26:04 2018
@@ -0,0 +1,92 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.jackrabbit.oak.segment.util;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.UUID;
+
+import org.junit.Test;
+
+public class CharsetEncodingUtilsTest {
+    // U+1F4A9 lies outside the BMP, so toChars() yields a two-char surrogate pair
+    private static final char[] SURROGATE_PAIR = Character.toChars(0x1f4a9);
+
+    @Test
+    public void encodeValid() throws IOException {
+        // a a-umlaut euro plane-1-char
+        String test = "a \u00E4 \u20ac " + new String(SURROGATE_PAIR);
+        byte[] withStringClass = test.getBytes(StandardCharsets.UTF_8);
+        byte[] withUtilsClass = CharsetEncodingUtils.encodeAsUTF8(test);
+        assertArrayEquals(withStringClass, withUtilsClass);
+        assertEquals(test, new String(withUtilsClass, StandardCharsets.UTF_8));
+    }
+
+    @Test
+    public void encodeInValid() {
+        // a a-umlaut euro plane-1-char, second char in surrogate pair missing
+        String test = "a \u00E4 \u20ac " + SURROGATE_PAIR[0];
+        try {
+            CharsetEncodingUtils.encodeAsUTF8(test);
+            fail("expected encoding to fail");
+        } catch (IOException expected) {
+            // expected
+        }
+    }
+
+    @Test
+    public void encodeMultiThreaded() throws InterruptedException {
+        int tc = 20;
+        Thread[] threads = new Thread[tc];
+        // one slot per worker; join() below makes the writes visible to this thread
+        final Throwable[] failures = new Throwable[tc];
+        for (int i = 0; i < tc; i++) {
+            final int slot = i;
+            threads[i] = new Thread(new Runnable() {
+                @Override
+                public void run() {
+                    try {
+                        // encode and decode 100 random UUID strings
+                        for (int j = 0; j < 100; j++) {
+                            String test = UUID.randomUUID().toString();
+                            byte[] bytes = CharsetEncodingUtils.encodeAsUTF8(test);
+                            assertEquals(test, new String(bytes, StandardCharsets.UTF_8));
+                        }
+                    } catch (Throwable t) {
+                        // errors thrown in a worker never reach JUnit on their own
+                        failures[slot] = t;
+                    }
+                }
+            });
+        }
+        for (int i = 0; i < tc; i++) {
+            threads[i].start();
+        }
+        for (int i = 0; i < tc; i++) {
+            threads[i].join();
+        }
+        for (int i = 0; i < tc; i++) {
+            if (failures[i] != null) {
+                throw new AssertionError("worker " + i + " failed", failures[i]);
+            }
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java
------------------------------------------------------------------------------
    svn:executable = *