You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by re...@apache.org on 2018/02/07 15:26:04 UTC
svn commit: r1823479 - in /jackrabbit/oak/trunk/oak-segment-tar/src:
main/java/org/apache/jackrabbit/oak/segment/util/
test/java/org/apache/jackrabbit/oak/segment/util/
Author: reschke
Date: Wed Feb 7 15:26:04 2018
New Revision: 1823479
URL: http://svn.apache.org/viewvc?rev=1823479&view=rev
Log:
OAK-7249: Create charset encoding utility that detects malformed input
Added:
jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java (with props)
jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/
jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java (with props)
Added: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java?rev=1823479&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java (added)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java Wed Feb 7 15:26:04 2018
@@ -0,0 +1,68 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.jackrabbit.oak.segment.util;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Static helpers for encoding strings into UTF-8 bytes; malformed input is reported, not replaced.
+ */
+public final class CharsetEncodingUtils {
+
+    private CharsetEncodingUtils() {
+    }
+
+    // CharsetEncoder instances are not safe for concurrent use, hence one encoder per thread
+    private static final ThreadLocal<CharsetEncoder> CSE = new ThreadLocal<CharsetEncoder>() {
+        @Override
+        protected CharsetEncoder initialValue() {
+            CharsetEncoder e = StandardCharsets.UTF_8.newEncoder();
+            e.onUnmappableCharacter(CodingErrorAction.REPORT);
+            e.onMalformedInput(CodingErrorAction.REPORT);
+            return e;
+        }
+    };
+
+    // copies the remaining content of the buffer into a new array (advances the buffer position)
+    private static byte[] bytes(ByteBuffer b) {
+        byte[] a = new byte[b.remaining()];
+        b.get(a);
+        return a;
+    }
+
+    /**
+     * Like {@link String#getBytes(java.nio.charset.Charset)} (with "UTF-8"),
+     * except that encoding problems (like unpaired surrogates) are reported as
+     * exceptions (see {@link CodingErrorAction#REPORT}), instead of being
+     * silently replaced as would happen otherwise.
+     *
+     * @param input string to encode
+     * @return String encoded using {@link StandardCharsets#UTF_8}
+     * @throws IOException on encoding error (malformed or unmappable input)
+     */
+    public static byte[] encodeAsUTF8(String input) throws IOException {
+        CharsetEncoder e = CSE.get();
+        e.reset();
+        return bytes(e.encode(CharBuffer.wrap(input.toCharArray())));
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtils.java
------------------------------------------------------------------------------
svn:executable = *
Added: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java?rev=1823479&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java (added)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java Wed Feb 7 15:26:04 2018
@@ -0,0 +1,92 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.jackrabbit.oak.segment.util;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.UUID;
+
+import org.junit.Test;
+
+public class CharsetEncodingUtilsTest {
+
+    private static final char[] SURROGATE_PAIR = Character.toChars(0x1f4a9);
+
+    @Test
+    public void encodeValid() throws IOException {
+        // a a-umlaut euro plane-1-char
+        String test = "a \u00E4 \u20ac " + new String(SURROGATE_PAIR);
+        byte[] withStringClass = test.getBytes(StandardCharsets.UTF_8);
+        byte[] withUtilsClass = CharsetEncodingUtils.encodeAsUTF8(test);
+        assertArrayEquals(withStringClass, withUtilsClass);
+        assertEquals(test, new String(withUtilsClass, StandardCharsets.UTF_8));
+    }
+
+    @Test
+    public void encodeInValid() {
+        // a a-umlaut euro plane-1-char, second char in surrogate pair missing
+        String test = "a \u00E4 \u20ac " + SURROGATE_PAIR[0];
+        try {
+            CharsetEncodingUtils.encodeAsUTF8(test);
+            fail("expected encoding to fail");
+        } catch (IOException expected) {
+            // expected
+        }
+    }
+
+    @Test
+    public void encodeMultiThreaded() throws InterruptedException {
+        int tc = 20;
+        Thread[] threads = new Thread[tc];
+        // errors on worker threads do not fail the test by themselves; record and check after join()
+        final Throwable[] failures = new Throwable[tc];
+
+        for (int i = 0; i < tc; i++) {
+            final int slot = i;
+            threads[i] = new Thread(new Runnable() {
+                @Override
+                public void run() {
+                    try {
+                        // encode and decode 100 random UUID strings
+                        for (int j = 0; j < 100; j++) {
+                            String test = UUID.randomUUID().toString();
+                            byte[] bytes = CharsetEncodingUtils.encodeAsUTF8(test);
+                            assertEquals(test, new String(bytes, StandardCharsets.UTF_8));
+                        }
+                    } catch (Throwable t) {
+                        failures[slot] = t;
+                    }
+                }
+            });
+        }
+        for (int i = 0; i < tc; i++) {
+            threads[i].start();
+        }
+        for (int i = 0; i < tc; i++) {
+            threads[i].join();
+        }
+        for (int i = 0; i < tc; i++) {
+            if (failures[i] != null) {
+                throw new AssertionError("worker thread " + i + " failed", failures[i]);
+            }
+        }
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/util/CharsetEncodingUtilsTest.java
------------------------------------------------------------------------------
svn:executable = *