You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2011/12/30 19:44:19 UTC
svn commit: r1225889 - in /incubator/jena/Jena2/ARQ/trunk/src:
main/java/org/openjena/atlas/io/BlockUTF8.java
test/java/org/openjena/atlas/io/TS_IO.java
test/java/org/openjena/atlas/io/TestBlockUTF8.java
test/java/org/openjena/atlas/io/TestStreamUTF8.java
Author: andy
Date: Fri Dec 30 18:44:19 2011
New Revision: 1225889
URL: http://svn.apache.org/viewvc?rev=1225889&view=rev
Log:
Conversion to/from UTF-8 bytes.
Added:
incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java
incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java
Modified:
incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java
incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java
Added: incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java?rev=1225889&view=auto
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java (added)
+++ incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java Fri Dec 30 18:44:19 2011
@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.openjena.atlas.io ;
+
+import java.io.IOException ;
+import java.nio.ByteBuffer ;
+import java.nio.CharBuffer ;
+
+import org.openjena.atlas.AtlasException ;
+
+/**
+ * Convert between bytes and chars, UTF-8 only.
+ *
+ * The usual Charset encoders/decoders are expensive to start up - they are also
+ * not thread safe. Sometimes we want to convert 10's of chars and UTF-8 can be
+ * done in code with no lookup tables (which, if used, are cache-unfriendly).
+ */
+
+public class BlockUTF8
+{
+    /** Converter used by {@link #toChars} and {@link #fromChars}. */
+    private static final Convert converter = new ConvertUTF8() ;
+    /** Single-byte converter; nothing in this class calls it at present. */
+    private static final Convert asciiConvert = new ConvertAscii() ;
+
+    /**
+     * Decode every remaining byte of {@code bb} as UTF-8 and append the
+     * resulting chars to {@code cb}. A code point above U+FFFF is written
+     * as a surrogate pair, so a 4 byte sequence needs room for two chars.
+     *
+     * @param bb source of bytes; drained on normal return
+     * @param cb destination; must have enough room (a full buffer raises
+     *           {@link java.nio.BufferOverflowException})
+     * @throws AtlasException on a truncated sequence, a bad continuation byte
+     *         or a value above U+10FFFF. An illegal lead byte is reported
+     *         through {@code IO.exception}.
+     */
+    public static void toChars(ByteBuffer bb, CharBuffer cb)
+    {
+        // Drain the buffer rather than count bytes: each call consumes at least one byte.
+        while ( bb.hasRemaining() )
+            converter.convertBytesToChar(bb, cb) ;
+    }
+
+    /**
+     * Encode every remaining char of {@code cb} as UTF-8 and append the
+     * bytes to {@code bb}. A surrogate pair becomes one 4 byte sequence.
+     * U+0000 is written as the two bytes C0 80 (modified UTF-8).
+     *
+     * @param cb source of chars; drained on normal return
+     * @param bb destination; must have enough room (a full buffer raises
+     *           {@link java.nio.BufferOverflowException})
+     */
+    public static void fromChars(CharBuffer cb, ByteBuffer bb)
+    {
+        // One call may consume two chars (a surrogate pair), so loop on
+        // what is left, not on the initial char count.
+        while ( cb.hasRemaining() )
+            converter.convertCharToBytes(cb, bb) ;
+    }
+
+    /** One conversion step in each direction. */
+    interface Convert
+    {
+        /** Decode one character from bb into cb. Return number of bytes consumed */
+        int convertBytesToChar(ByteBuffer bb, CharBuffer cb) ;
+
+        /** Encode one character from cb into bb. Return number of bytes produced */
+        int convertCharToBytes(CharBuffer cb, ByteBuffer bb) ;
+    }
+
+    // ASCII
+
+    /** One byte per char, no multi-byte handling. */
+    private static final class ConvertAscii implements Convert
+    {
+        @Override
+        public int convertBytesToChar(ByteBuffer bb, CharBuffer cb)
+        {
+            // Mask so a byte with the top bit set is not sign-extended into U+FFxx.
+            cb.put((char)(bb.get() & 0xFF)) ;
+            return 1 ;
+        }
+
+        @Override
+        public int convertCharToBytes(CharBuffer cb, ByteBuffer bb)
+        {
+            char c = cb.get() ;
+            // Keep the low 8 bits of the char and write that single byte.
+            bb.put((byte)(c & 0xFF)) ;
+            return 1 ;
+        }
+    }
+
+    /** UTF-8 conversion done with shifts and masks only. */
+    private static final class ConvertUTF8 implements Convert
+    {
+        @Override
+        public int convertBytesToChar(ByteBuffer bb, CharBuffer cb)
+        {
+            // bb.get() sign-extends: bytes 0x80..0xFF arrive as negative ints.
+            // The masks below only inspect the low 8 bits, so that is harmless.
+            int x = bb.get() ;
+            if ( x >= 0 )
+            {
+                // 0xxxxxxx => 1 byte. Includes 0x00, which is legal UTF-8.
+                cb.put((char)x) ;
+                return 1 ;
+            }
+
+            // 10...... => extension byte; not legal as a first byte, rejected at the end.
+            // 110yyyyy 10xxxxxx => 2 bytes
+            if ( (x & 0xE0) == 0xC0 )
+            {
+                // No overlong check: C0 80 (modified UTF-8 for U+0000) must decode.
+                int ch = readMultiBytes(bb, x & 0x1F, 2) ;
+                cb.put((char)ch) ;
+                return 2 ;
+            }
+
+            // 1110.... => 3 bytes : 16 bits : not outside 16bit chars
+            if ( (x & 0xF0) == 0xE0 )
+            {
+                int ch = readMultiBytes(bb, x & 0x0F, 3) ;
+                cb.put((char)ch) ;
+                return 3 ;
+            }
+
+            // 11110zzz => 4 bytes : the lead byte carries 3 payload bits.
+            if ( (x & 0xF8) == 0xF0 )
+            {
+                int ch = readMultiBytes(bb, x & 0x07, 4) ;
+                // 21 bits can hold more than Unicode allows.
+                if ( ! Character.isValidCodePoint(ch) )
+                    throw new AtlasException(String.format("Out of range code point: 0x%04X", ch)) ;
+                // One char, or a surrogate pair for anything above U+FFFF.
+                cb.put(Character.toChars(ch)) ;
+                return 4 ;
+            }
+
+            // NOTE(review): IO.exception is expected to throw; the return is
+            // only reached if it does not - confirm against IO.
+            IO.exception(new IOException("Illegal UTF-8: "+x)) ;
+            return -1 ;
+        }
+
+        /**
+         * Read the {@code len-1} continuation bytes of a {@code len} byte
+         * sequence and fold their 6 bit payloads onto {@code start}.
+         *
+         * @param input buffer positioned just after the lead byte
+         * @param start payload bits taken from the lead byte
+         * @param len   total length of the sequence, lead byte included
+         * @return the decoded value
+         * @throws AtlasException if the input ends early or a byte is not 10xxxxxx
+         */
+        private static int readMultiBytes(ByteBuffer input, int start, int len)
+        {
+            int x = start ;
+            for ( int i = 0 ; i < len-1 ; i++ )
+            {
+                if ( ! input.hasRemaining() )
+                    throw new AtlasException("Premature end to UTF-8 sequence at end of input") ;
+
+                int x2 = input.get() ;
+                if ( (x2 & 0xC0) != 0x80 )
+                    // Mask for the message so the byte is not printed sign-extended.
+                    throw new AtlasException(String.format("Illegal UTF-8 processing character: 0x%04X", x2 & 0xFF)) ;
+                // 6 bits of x2
+                x = (x << 6) | (x2 & 0x3F) ;
+            }
+            return x ;
+        }
+
+        @Override
+        public int convertCharToBytes(CharBuffer cb, ByteBuffer bb)
+        {
+            char ch = cb.get() ;
+            if ( ch == 0 )
+            {
+                // Modified UTF-8.
+                bb.put((byte)0xC0) ;
+                bb.put((byte)0x80) ;
+                return 2 ;
+            }
+
+            if ( ch <= 0x7F )
+            {
+                // 7 bits
+                bb.put((byte)ch) ;
+                return 1 ;
+            }
+
+            if ( ch <= 0x07FF )
+            {
+                // 11 bits : 110yyyyy 10xxxxxx
+                outputBytes(bb, ((ch>>6)&0x1F) | 0xC0, 2, ch) ;
+                return 2 ;
+            }
+
+            if ( Character.isHighSurrogate(ch) && cb.hasRemaining() )
+            {
+                // Peek (absolute get) so a non-matching char is left for the next call.
+                char low = cb.get(cb.position()) ;
+                if ( Character.isLowSurrogate(low) )
+                {
+                    cb.get() ;  // Consume the low surrogate.
+                    // 21 bits : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                    int cp = Character.toCodePoint(ch, low) ;
+                    outputBytes(bb, ((cp>>18)&0x07) | 0xF0, 4, cp) ;
+                    return 4 ;
+                }
+            }
+
+            // 16 bits : 1110aaaa 10bbbbbb 10cccccc
+            // An unpaired surrogate also ends up here and is written as-is.
+            outputBytes(bb, ((ch>>12)&0x0F) | 0xE0, 3, ch) ;
+            return 3 ;
+        }
+
+        /*
+         * Bits
+         * 7    U+007F      1 to 127              0xxxxxxx
+         * 11   U+07FF      128 to 2,047          110xxxxx 10xxxxxx
+         * 16   U+FFFF      2,048 to 65,535       1110xxxx 10xxxxxx 10xxxxxx
+         * 21   U+1FFFFF    65,536 to 1,114,111   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+         * Longer forms (5 and 6 bytes) are never produced.
+         */
+        /**
+         * Write a multi-byte sequence: the prepared lead byte {@code x1},
+         * then {@code byteLength-1} continuation bytes taken from the low
+         * bits of {@code ch}, most significant group first.
+         */
+        private static void outputBytes(ByteBuffer bb, int x1, int byteLength, int ch)
+        {
+            // byteLength = 3 => 2 continuation bytes => shift=6 and shift=0
+            bb.put((byte)x1) ;
+            int remaining = byteLength-1 ;
+            for ( int i = 0 ; i < remaining ; i++ )
+            {
+                // 6 bits, loop from high to low
+                int shift = 6*(remaining-i-1) ;
+                int x = ((ch>>shift) & 0x3F) | 0x80 ;   // 10xxxxxx
+                bb.put((byte)x) ;
+            }
+        }
+    }
+
+}
Modified: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java?rev=1225889&r1=1225888&r2=1225889&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java Fri Dec 30 18:44:19 2011
@@ -28,6 +28,7 @@ import org.junit.runners.Suite ;
TestIndentedWriter.class
, TestBufferingWriter.class
, TestStreamUTF8.class
+ , TestBlockUTF8.class
, TestInputStreamBuffered.class
// Peek readers.
Added: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java?rev=1225889&view=auto
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java (added)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java Fri Dec 30 18:44:19 2011
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.openjena.atlas.io;
+
+import java.io.ByteArrayOutputStream ;
+import java.io.IOException ;
+import java.io.OutputStreamWriter ;
+import java.io.Writer ;
+import java.nio.ByteBuffer ;
+import java.nio.CharBuffer ;
+import java.nio.charset.Charset ;
+import java.nio.charset.CharsetDecoder ;
+import java.nio.charset.CharsetEncoder ;
+
+import org.junit.Test ;
+import org.openjena.atlas.io.BlockUTF8 ;
+import org.openjena.atlas.junit.BaseTest ;
+import org.openjena.atlas.lib.Chars ;
+
+/**
+ * Round-trip tests for {@link BlockUTF8}: the bytes it produces are compared
+ * with the bytes written by an {@code OutputStreamWriter} using the UTF-8
+ * charset, and the chars it produces are compared with the original string.
+ */
+public class TestBlockUTF8 extends BaseTest
+{
+    static Charset utf8 = Chars.charsetUTF8 ;
+    static CharsetDecoder dec = utf8.newDecoder() ;
+    static CharsetEncoder enc = utf8.newEncoder() ;
+
+    // UTF-8 encoding.
+    // character '¢' = code point U+00A2 -> C2 A2
+    // character '€' = code point U+20AC -> E2 82 AC
+
+    // Non-ASCII test data is written with Unicode escapes so that it does not
+    // depend on the encoding used to read or transmit this source file.
+    static private final String asciiBase       = "abc" ;
+    static private final String latinBase       = "\u00C0\u00E9\u00ED\u00FF" ;          // Àéíÿ
+    static private final String latinExtraBase  = "\u1EF9\uFB01\uFB02" ;                // ỹ, fi-ligature, fl-ligature
+    static private final String greekBase       = "\u03B1\u03B2\u03B3" ;                // αβγ
+    static private final String hewbrewBase     = "\u05D0\u05D1\u05D2" ;                // אבג
+    static private final String arabicBase      = "\u0621\u0622\u0623" ;                // ءآأ
+    static private final String symbolsBase     = "\u263A\u263B\u266A\u266B" ;          // ☺☻♪♫
+    static private final String chineseBase     = "\u5B6B\u5B50\u5175\u6CD5" ;          // 孫子兵法 : The Art of War
+    static private final String japaneseBase    = "\u65E5\u672C" ;                      // 日本 : Japanese
+
+    @Test public void convert_in_00() { testIn("") ; }
+    @Test public void convert_in_01() { testIn(asciiBase) ; }
+    @Test public void convert_in_02() { testIn(latinBase) ; }
+    @Test public void convert_in_03() { testIn(latinExtraBase) ; }
+    @Test public void convert_in_04() { testIn(greekBase) ; }
+    @Test public void convert_in_05() { testIn(hewbrewBase) ; }
+    @Test public void convert_in_06() { testIn(arabicBase) ; }
+    @Test public void convert_in_07() { testIn(symbolsBase) ; }
+    @Test public void convert_in_08() { testIn(chineseBase) ; }
+    @Test public void convert_in_09() { testIn(japaneseBase) ; }
+
+    @Test public void convert_out_00() { testOut("") ; }
+    @Test public void convert_out_01() { testOut(asciiBase) ; }
+    @Test public void convert_out_02() { testOut(latinBase) ; }
+    @Test public void convert_out_03() { testOut(latinExtraBase) ; }
+    @Test public void convert_out_04() { testOut(greekBase) ; }
+    @Test public void convert_out_05() { testOut(hewbrewBase) ; }
+    @Test public void convert_out_06() { testOut(arabicBase) ; }
+    @Test public void convert_out_07() { testOut(symbolsBase) ; }
+    @Test public void convert_out_08() { testOut(chineseBase) ; }
+    @Test public void convert_out_09() { testOut(japaneseBase) ; }
+
+    /**
+     * Chars to bytes, then back: the bytes must equal those from
+     * {@link #stringAsBytes} and decoding them must give {@code x} again.
+     */
+    static void testIn(String x)
+    {
+        // Correct answer, in bytes
+        ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)) ;
+        // To bytes.
+        int N = x.length() ;
+        CharBuffer cb = CharBuffer.wrap(x.toCharArray()) ;
+        ByteBuffer bb = ByteBuffer.allocate(4*N) ;
+        BlockUTF8.fromChars(cb, bb) ;
+        bb.flip() ;
+
+        assertTrue("Bytes", sameBytes(bytes, bb)) ;
+
+        // From bytes.
+        CharBuffer cb2 = CharBuffer.allocate(N) ;
+        BlockUTF8.toChars(bb, cb2) ;
+        String str = new String(cb2.array(), 0, cb2.position()) ;
+        assertEquals(x, str) ;
+    }
+
+    /**
+     * Compare the remaining content of two buffers using absolute reads.
+     * Does not move position.
+     */
+    public static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2)
+    {
+        int len = bb1.remaining() ;
+        if ( len != bb2.remaining() ) return false ;
+
+        for ( int i = 0 ; i < len ; i++ )
+            if ( bb1.get(i+bb1.position()) != bb2.get(i+bb2.position()) ) return false ;
+        return true ;
+    }
+
+    /**
+     * Bytes to chars, then back: decoding the bytes from
+     * {@link #stringAsBytes} must give {@code x}, and re-encoding those
+     * chars must reproduce the same bytes.
+     */
+    static void testOut(String x)
+    {
+        int N = x.length() ;
+        // First - get bytes the Java way.
+        ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)) ;
+        CharBuffer cb = CharBuffer.allocate(N) ;
+
+        BlockUTF8.toChars(bytes, cb) ;
+        bytes.flip() ;      // Rewind for the comparison below.
+        String str = new String(cb.array(), 0, cb.position()) ;
+        assertEquals(x, str) ;
+        cb.flip() ;
+
+        ByteBuffer bytes2 = ByteBuffer.allocate(bytes.capacity()) ;
+        BlockUTF8.fromChars(cb, bytes2) ;
+        bytes2.flip() ;
+
+        assertTrue("Chars", sameBytes(bytes, bytes2)) ;
+    }
+
+    /** Encode {@code x} by writing it through an {@code OutputStreamWriter} set to the {@code utf8} charset. */
+    static byte[] stringAsBytes(String x)
+    {
+        try {
+            ByteArrayOutputStream bout = new ByteArrayOutputStream() ;
+            Writer out = new OutputStreamWriter(bout, utf8) ;
+            out.write(x) ;
+            out.close() ;
+            return bout.toByteArray() ;
+        } catch (IOException ex) { throw new RuntimeException(ex) ; }
+    }
+}
Modified: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java?rev=1225889&r1=1225888&r2=1225889&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java Fri Dec 30 18:44:19 2011
@@ -51,6 +51,7 @@ public class TestStreamUTF8 extends Base
static private final String chineseBase = "孫子兵法" ; // The Art of War
static private final String japaneseBase = "日本" ; // Japanese
+ @Test public void test_in_00() { testIn("") ; }
@Test public void test_in_01() { testIn(asciiBase) ; }
@Test public void test_in_02() { testIn(latinBase) ; }
@Test public void test_in_03() { testIn(latinExtraBase) ; }
@@ -61,15 +62,16 @@ public class TestStreamUTF8 extends Base
@Test public void test_in_08() { testIn(chineseBase) ; }
@Test public void test_in_09() { testIn(japaneseBase) ; }
- @Test public void test_out_01() { testIn(asciiBase) ; }
- @Test public void test_out_02() { testIn(latinBase) ; }
- @Test public void test_out_03() { testIn(latinExtraBase) ; }
- @Test public void test_out_04() { testIn(greekBase) ; }
- @Test public void test_out_05() { testIn(hewbrewBase) ; }
- @Test public void test_out_06() { testIn(arabicBase) ; }
- @Test public void test_out_07() { testIn(symbolsBase) ; }
- @Test public void test_out_08() { testIn(chineseBase) ; }
- @Test public void test_out_09() { testIn(japaneseBase) ; }
+ @Test public void test_out_00() { testOut("") ; } // Empty string, like the other test_out_NN cases.
+ @Test public void test_out_01() { testOut(asciiBase) ; }
+ @Test public void test_out_02() { testOut(latinBase) ; }
+ @Test public void test_out_03() { testOut(latinExtraBase) ; }
+ @Test public void test_out_04() { testOut(greekBase) ; }
+ @Test public void test_out_05() { testOut(hewbrewBase) ; }
+ @Test public void test_out_06() { testOut(arabicBase) ; }
+ @Test public void test_out_07() { testOut(symbolsBase) ; }
+ @Test public void test_out_08() { testOut(chineseBase) ; }
+ @Test public void test_out_09() { testOut(japaneseBase) ; }
static void testIn(String x)
{