You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2011/12/30 19:44:19 UTC

svn commit: r1225889 - in /incubator/jena/Jena2/ARQ/trunk/src: main/java/org/openjena/atlas/io/BlockUTF8.java test/java/org/openjena/atlas/io/TS_IO.java test/java/org/openjena/atlas/io/TestBlockUTF8.java test/java/org/openjena/atlas/io/TestStreamUTF8.java

Author: andy
Date: Fri Dec 30 18:44:19 2011
New Revision: 1225889

URL: http://svn.apache.org/viewvc?rev=1225889&view=rev
Log:
Conversion to/from UTF-8 bytes.

Added:
    incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java
    incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java
Modified:
    incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java
    incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java

Added: incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java?rev=1225889&view=auto
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java (added)
+++ incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/atlas/io/BlockUTF8.java Fri Dec 30 18:44:19 2011
@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.openjena.atlas.io ;
+
+import java.io.IOException ;
+import java.nio.ByteBuffer ;
+import java.nio.CharBuffer ;
+
+import org.openjena.atlas.AtlasException ;
+
+/**
+ * Convert between bytes and chars, UTF-8 only.
+ * 
+ * The usual Charset encoders/decoders are expensive to start up - they are also
+ * not thread-safe. Sometimes we want to convert tens of chars and UTF-8 can be
+ * done in code with no lookup tables (which, if used, are cache-unfriendly).
+ */
+
+public class BlockUTF8
+{
+    private static Convert converter = new ConvertUTF8() ;
+    private static Convert asciiConvert = new ConvertAscii() ;
+
+    /**
+     * Decode every remaining byte of bb as UTF-8 and append the chars to cb.
+     * A 4-byte sequence is appended as a surrogate pair (two chars).
+     *
+     * @param bb  bytes to decode, read from its current position to its limit
+     * @param cb  destination for the decoded chars
+     * @throws AtlasException on a malformed continuation byte or an out-of-range code point
+     */
+    public static void toChars(ByteBuffer bb, CharBuffer cb)
+    {
+        int len = bb.remaining() ;
+
+        for (int i = 0; i < len;)
+        {
+            i += converter.convertBytesToChar(bb, cb) ;
+        }
+    }
+
+    /**
+     * Encode every remaining char of cb as UTF-8 and append the bytes to bb.
+     * U+0000 is written as C0 80 (modified UTF-8); a surrogate pair is written
+     * as one 4-byte sequence.
+     *
+     * @param cb  chars to encode, read from its current position to its limit
+     * @param bb  destination for the encoded bytes
+     */
+    public static void fromChars(CharBuffer cb, ByteBuffer bb)
+    {
+        // Not a counted loop: one step consumes two chars for a surrogate pair.
+        while ( cb.hasRemaining() )
+        {
+            converter.convertCharToBytes(cb, bb) ;
+        }
+    }
+
+    /** One conversion step in each direction between bytes and chars. */
+    interface Convert
+    {
+        /** Return number of bytes consumed */
+        int convertBytesToChar(ByteBuffer bb, CharBuffer cb) ;
+
+        /** Return number of bytes produced */
+        int convertCharToBytes(CharBuffer cb, ByteBuffer bb) ;
+    }
+
+    // ASCII
+
+    private static final class ConvertAscii implements Convert
+    {
+        @Override
+        public int convertBytesToChar(ByteBuffer bb, CharBuffer cb)
+        {
+            byte b = bb.get() ;
+            // ASCII : mask so the byte is not sign-extended into the char.
+            char c = (char)(b & 0xFF) ;
+            cb.put(c) ;
+            return 1 ;
+        }
+
+        @Override
+        public int convertCharToBytes(CharBuffer cb, ByteBuffer bb)
+        {
+            char c = cb.get() ;
+            // Low 8 bits of the char.
+            byte b = (byte)(c & 0xFF) ;
+            bb.put(b) ;
+            return 1 ;
+        }
+    }
+
+    // UTF-8
+
+    private static final class ConvertUTF8 implements Convert
+    {
+        /**
+         * Decode one UTF-8 sequence from bb into cb; returns the bytes consumed.
+         * A zero byte is not taken as a 1-byte sequence (it reaches the illegal-input
+         * branch); U+0000 is read from the 2-byte form C0 80.
+         */
+        @Override
+        public int convertBytesToChar(ByteBuffer bb, CharBuffer cb)
+        {
+            int x = bb.get() ;
+            if ( x > 0 && x <= 127 )
+            {
+                cb.put((char)x) ;
+                return 1 ;
+            }
+
+            // 10 => extension byte
+            // 110..... => 2 bytes
+            if ( (x & 0xE0) == 0xC0 )
+            {
+                int ch = readMultiBytes(bb, x & 0x1F, 2) ;
+                cb.put((char)ch) ;
+                return 2 ;
+            }
+            //  1110.... => 3 bytes : 16 bits : not outside 16bit chars
+            if ( (x & 0xF0) == 0xE0 )
+            {
+                int ch = readMultiBytes(bb, x & 0x0F, 3) ;
+                cb.put((char)ch) ;
+                return 3 ;
+            }
+
+            // 11110zzz => 4 bytes : zzz is the low 3 bits of the lead byte.
+            if ( (x & 0xF8) == 0xF0 )
+            {
+                int ch = readMultiBytes(bb, x & 0x07, 4) ;
+                // Character.toChars throws IllegalArgumentException above U+10FFFF.
+                if ( ch > Character.MAX_CODE_POINT )
+                    throw new AtlasException(String.format("Illegal UTF-8: codepoint out of range: 0x%04X",ch)) ;
+                // Beyond 16 bits : a surrogate pair.
+                char chars[] = Character.toChars(ch) ;
+                cb.put(chars) ;
+                return 4 ;
+            }
+            else
+            {
+                IO.exception(new IOException("Illegal UTF-8: "+x)) ;
+                return -1 ;
+            }
+        }
+
+        /**
+         * Read the len-1 continuation bytes (10xxxxxx) that follow a lead byte.
+         *
+         * @param input  source, positioned just after the lead byte
+         * @param start  payload bits taken from the lead byte
+         * @param len    length of the whole sequence, lead byte included
+         * @return start with 6 payload bits from each continuation byte shifted in
+         * @throws AtlasException if a byte does not have the form 10xxxxxx
+         */
+        private static int readMultiBytes(ByteBuffer input, int start, int len)
+        {
+            int x = start ;
+            for ( int i = 0 ; i < len-1 ; i++ )
+            {
+                // No end marker to test for: get() throws once the buffer is exhausted.
+                int x2 = input.get() ;
+                if ( (x2 & 0xC0) != 0x80 )
+                    throw new AtlasException(String.format("Illegal UTF-8 processing character: 0x%04X",x2 & 0xFF)) ;
+                // 6 bits of x2
+                x = (x << 6) | (x2 & 0x3F);
+            }
+            return x ;
+        }
+
+        /** Encode the next char of cb (both chars of a surrogate pair) into bb; returns the bytes produced. */
+        @Override
+        public int convertCharToBytes(CharBuffer cb, ByteBuffer bb)
+        {
+            char ch = cb.get() ;
+            if ( ch != 0 && ch <= 127 )
+            {
+                // 7 bits
+                bb.put((byte)ch) ;
+                return 1 ;
+            }
+
+            if ( ch == 0 )
+            {
+                // Modified UTF-8.
+                bb.put((byte)0xC0) ;
+                bb.put((byte)0x80) ;
+                return 2 ;
+            }
+
+            if ( ch <= 0x07FF )
+            {
+                // 11 bits : 110yyyyy 10xxxxxx
+                int x1 = ( ((ch>>(11-5))&0x01F ) | 0xC0 ) ;
+                outputBytes(bb, x1, 2, ch) ;
+                return 2 ;
+            }
+
+            // A Java char is 16 bits: a code point above U+FFFF arrives as a
+            // high surrogate followed by a low surrogate.
+            if ( Character.isHighSurrogate(ch) && cb.hasRemaining() )
+            {
+                // Peek: the absolute get does not move the position.
+                char low = cb.get(cb.position()) ;
+                if ( Character.isLowSurrogate(low) )
+                {
+                    cb.get() ;
+                    int cp = Character.toCodePoint(ch, low) ;
+                    // 21 bits : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                    int x1 = ( ((cp>>(21-3))&0x7) | 0xF0 ) ;
+                    outputBytes(bb, x1, 4, cp) ;
+                    return 4 ;
+                }
+            }
+
+            // 16 bits : 1110aaaa  10bbbbbb  10cccccc
+            // An unpaired surrogate is also written this way.
+            int x1 = ( ((ch>>(16-4))&0x0F) | 0xE0 ) ;
+            outputBytes(bb, x1, 3, ch) ;
+            return 3 ;
+        }
+
+        /*
+         * Write a sequence of byteLength bytes: the lead byte x1, then the low
+         * 6*(byteLength-1) bits of ch, 6 bits per continuation byte, high bits first.
+         *
+         * Bits
+         * 7    U+007F      1 to 127              0xxxxxxx
+         * 11   U+07FF      128 to 2,047          110xxxxx 10xxxxxx
+         * 16   U+FFFF      2,048 to 65,535       1110xxxx 10xxxxxx 10xxxxxx
+         * 21   U+1FFFFF    65,536 to 1,114,111   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+         * 26   U+3FFFFFF                         111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+         * 31   U+7FFFFFFF                        1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+         */
+        private static void outputBytes(ByteBuffer bb, int x1, int byteLength, int ch)
+        {
+            // byteLength = 3 => 2 continuation bytes => shift=6 and shift=0
+            bb.put((byte)x1) ;
+            byteLength-- ; // remaining bytes
+            for ( int i = 0 ; i < byteLength ; i++ )
+            {
+                // 6 Bits, loop from high to low
+                int shift = 6*(byteLength-i-1) ;
+                int x =  (ch>>shift) & 0x3F ;
+                x = x | 0x80 ;  // 10xxxxxx
+                bb.put((byte)x) ;
+            }
+        }
+    }
+
+}

Modified: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java?rev=1225889&r1=1225888&r2=1225889&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TS_IO.java Fri Dec 30 18:44:19 2011
@@ -28,6 +28,7 @@ import org.junit.runners.Suite ;
     TestIndentedWriter.class
     , TestBufferingWriter.class
     , TestStreamUTF8.class
+    , TestBlockUTF8.class
     , TestInputStreamBuffered.class
 
     // Peek readers.

Added: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java?rev=1225889&view=auto
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java (added)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestBlockUTF8.java Fri Dec 30 18:44:19 2011
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.openjena.atlas.io;
+
+import java.io.ByteArrayOutputStream ;
+import java.io.IOException ;
+import java.io.OutputStreamWriter ;
+import java.io.Writer ;
+import java.nio.ByteBuffer ;
+import java.nio.CharBuffer ;
+import java.nio.charset.Charset ;
+import java.nio.charset.CharsetDecoder ;
+import java.nio.charset.CharsetEncoder ;
+
+import org.junit.Test ;
+import org.openjena.atlas.io.BlockUTF8 ;
+import org.openjena.atlas.junit.BaseTest ;
+import org.openjena.atlas.lib.Chars ;
+
+public class TestBlockUTF8 extends BaseTest
+    {
+        static Charset utf8 = Chars.charsetUTF8 ;
+        static CharsetDecoder dec = utf8.newDecoder() ;
+        static CharsetEncoder enc = utf8.newEncoder() ;
+
+        // UTF-8 encoding.
+        // character '¢' = code point U+00A2 -> C2 A2
+        // character '€' = code point U+20AC -> E2 82 AC
+
+        static private final String asciiBase             = "abc" ;
+        static private final String latinBase             = "Àéíÿ" ;
+        static private final String latinExtraBase        = "ỹfifl" ;  // fi-ligature, fl-ligature
+        static private final String greekBase             = "αβγ" ;
+        static private final String hebrewBase            = "אבג" ;
+        static private final String arabicBase            = "ءآأ";
+        static private final String symbolsBase           = "☺☻♪♫" ;
+        static private final String chineseBase           = "孫子兵法" ; // The Art of War 
+        static private final String japaneseBase          = "日本" ;    // Japanese
+
+        @Test public void convert_in_00() { testIn("") ; }
+        @Test public void convert_in_01() { testIn(asciiBase) ; }
+        @Test public void convert_in_02() { testIn(latinBase) ; }
+        @Test public void convert_in_03() { testIn(latinExtraBase) ; }
+        @Test public void convert_in_04() { testIn(greekBase) ; }
+        @Test public void convert_in_05() { testIn(hebrewBase) ; }
+        @Test public void convert_in_06() { testIn(arabicBase) ; }
+        @Test public void convert_in_07() { testIn(symbolsBase) ; }
+        @Test public void convert_in_08() { testIn(chineseBase) ; }
+        @Test public void convert_in_09() { testIn(japaneseBase) ; }
+
+        @Test public void convert_out_00() { testOut("") ; }
+        @Test public void convert_out_01() { testOut(asciiBase) ; }
+        @Test public void convert_out_02() { testOut(latinBase) ; }
+        @Test public void convert_out_03() { testOut(latinExtraBase) ; }
+        @Test public void convert_out_04() { testOut(greekBase) ; }
+        @Test public void convert_out_05() { testOut(hebrewBase) ; }
+        @Test public void convert_out_06() { testOut(arabicBase) ; }
+        @Test public void convert_out_07() { testOut(symbolsBase) ; }
+        @Test public void convert_out_08() { testOut(chineseBase) ; }
+        @Test public void convert_out_09() { testOut(japaneseBase) ; }
+
+        /** chars -> bytes must equal stringAsBytes(x); those bytes -> chars must give x back. */
+        static void testIn(String x)
+        {
+            // Correct answer, in bytes
+            ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)) ;
+            // To bytes.
+            int N = x.length() ;
+            CharBuffer cb = CharBuffer.wrap(x.toCharArray()) ;
+            ByteBuffer bb = ByteBuffer.allocate(4*N) ;
+            BlockUTF8.fromChars(cb, bb) ;
+            bb.flip() ;
+
+            assertTrue("Bytes", sameBytes(bytes, bb)) ;
+
+            // From bytes.
+            CharBuffer cb2 = CharBuffer.allocate(N) ;
+            BlockUTF8.toChars(bb, cb2) ;
+            String str = new String(cb2.array(), 0, cb2.position()) ;
+            assertEquals(x, str) ;
+        }
+
+        // Does not move position.
+        public static boolean sameBytes(ByteBuffer bb1, ByteBuffer bb2)
+        {
+            if ( bb1.remaining() != bb2.remaining() ) return false ;
+
+            for ( int i = 0 ; i < bb1.remaining() ; i++ )
+                if ( bb1.get(i+bb1.position()) != bb2.get(i+bb2.position()) ) return false ;
+            return true ;
+        }
+
+        /** bytes -> chars -> bytes must reproduce the bytes from stringAsBytes(x). */
+        static void testOut(String x)
+        {
+            int N = x.length() ;
+            // First - get bytes the Java way.
+            ByteBuffer bytes = ByteBuffer.wrap(stringAsBytes(x)) ;
+            CharBuffer cb = CharBuffer.allocate(N) ;
+
+            BlockUTF8.toChars(bytes, cb) ;
+            bytes.flip() ;
+            cb.flip() ;
+
+            ByteBuffer bytes2 = ByteBuffer.allocate(bytes.capacity()) ;
+            BlockUTF8.fromChars(cb, bytes2) ;
+            bytes2.flip() ;
+
+            assertTrue("Chars", sameBytes(bytes, bytes2)) ;
+        }
+
+        /** x written through an OutputStreamWriter using the utf8 charset : the reference bytes. */
+        static byte[] stringAsBytes(String x)
+        {
+            try {
+                ByteArrayOutputStream bout = new ByteArrayOutputStream() ;
+                Writer out = new OutputStreamWriter(bout, utf8) ;
+                out.write(x) ;
+                out.close() ;
+                byte[] bytes = bout.toByteArray() ;
+                return bytes ;
+            } catch (IOException ex) { throw new RuntimeException(ex) ; } 
+        }
+    }

Modified: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java?rev=1225889&r1=1225888&r2=1225889&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/atlas/io/TestStreamUTF8.java Fri Dec 30 18:44:19 2011
@@ -51,6 +51,7 @@ public class TestStreamUTF8 extends Base
         static private final String chineseBase           = "孫子兵法" ; // The Art of War 
         static private final String japaneseBase          = "日本" ;    // Japanese
         
+        @Test public void test_in_00() { testIn("") ; }
         @Test public void test_in_01() { testIn(asciiBase) ; }
         @Test public void test_in_02() { testIn(latinBase) ; }
         @Test public void test_in_03() { testIn(latinExtraBase) ; }
@@ -61,15 +62,16 @@ public class TestStreamUTF8 extends Base
         @Test public void test_in_08() { testIn(chineseBase) ; }
         @Test public void test_in_09() { testIn(japaneseBase) ; }
         
-        @Test public void test_out_01() { testIn(asciiBase) ; }
-        @Test public void test_out_02() { testIn(latinBase) ; }
-        @Test public void test_out_03() { testIn(latinExtraBase) ; }
-        @Test public void test_out_04() { testIn(greekBase) ; }
-        @Test public void test_out_05() { testIn(hewbrewBase) ; }
-        @Test public void test_out_06() { testIn(arabicBase) ; }
-        @Test public void test_out_07() { testIn(symbolsBase) ; }
-        @Test public void test_out_08() { testIn(chineseBase) ; }
-        @Test public void test_out_09() { testIn(japaneseBase) ; }
+        @Test public void test_out_00() { testOut("") ; }
+        @Test public void test_out_01() { testOut(asciiBase) ; }
+        @Test public void test_out_02() { testOut(latinBase) ; }
+        @Test public void test_out_03() { testOut(latinExtraBase) ; }
+        @Test public void test_out_04() { testOut(greekBase) ; }
+        @Test public void test_out_05() { testOut(hewbrewBase) ; }
+        @Test public void test_out_06() { testOut(arabicBase) ; }
+        @Test public void test_out_07() { testOut(symbolsBase) ; }
+        @Test public void test_out_08() { testOut(chineseBase) ; }
+        @Test public void test_out_09() { testOut(japaneseBase) ; }
         
         static void testIn(String x)
         {