You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2011/10/05 20:16:44 UTC

svn commit: r1179358 - in /incubator/jena/Jena2/ARQ/trunk/src: main/java/org/openjena/riot/system/RiotChars.java main/java/org/openjena/riot/tokens/TokenizerText.java test/java/org/openjena/riot/tokens/TestTokenizer.java

Author: andy
Date: Wed Oct  5 18:16:44 2011
New Revision: 1179358

URL: http://svn.apache.org/viewvc?rev=1179358&view=rev
Log:
JENA-129

Modified:
    incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/system/RiotChars.java
    incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/tokens/TokenizerText.java
    incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/riot/tokens/TestTokenizer.java

Modified: incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/system/RiotChars.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/system/RiotChars.java?rev=1179358&r1=1179357&r2=1179358&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/system/RiotChars.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/system/RiotChars.java Wed Oct  5 18:16:44 2011
@@ -75,34 +75,34 @@ PN_LOCAL       ::=  ( PN_CHARS_U | [0-9]
     
     public static boolean isPNCharsBase(int ch)
     {
-        //??
-        //int type = Character.getType(ch) ;
-        //Character.COMBINING_SPACING_MARK -> What category are we looking at?
+        // PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | 
+        //                   [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+        //                   [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | 
+        //                   [#x10000-#xEFFFF]
         return 
-            r(ch, 'a', 'z') || r(ch, 'A', 'Z') || r(ch, 0x00C0, 0x00D6) || r(ch, 0x00D8, 0x00F6) ||
-            r(ch, 0x00F8, 0x02FF) || r(ch, 0x0370, 0x037D) || r(ch, 0x037F, 0x1FFF) || 
-            r(ch, 0x200C, 0x200D) || r(ch, 0x2070, 0x218F) ||
-            r(ch, 0x2C00 , 0x2FEF) || r(ch, 0x3001, 0xD7FF) || r(ch, 0xF900, 0xFDCF) || r(ch, 0xFDF0, 0xFFFD) ||
+            r(ch, 'a', 'z') || r(ch, 'A', 'Z') || r(ch, 0x00C0, 0x00D6) || r(ch, 0x00D8, 0x00F6) || r(ch, 0x00F8, 0x02FF) ||
+            r(ch, 0x0370, 0x037D) || r(ch, 0x037F, 0x1FFF) || r(ch, 0x200C, 0x200D) || r(ch, 0x2070, 0x218F) ||
+            r(ch, 0x2C00, 0x2FEF) || r(ch, 0x3001, 0xD7FF) || r(ch, 0xF900, 0xFDCF) || r(ch, 0xFDF0, 0xFFFD) ||
             r(ch, 0x10000, 0xEFFFF) ; // Outside the Basic Multilingual Plane.
     }
     
     public static boolean isPNChars_U(int ch)
     {
+        //PN_CHARS_BASE | '_'
         return isPNCharsBase(ch) || ( ch == '_' ) ;
     }
     
     public static boolean isPNChars_U_N(int ch)
     {
+        // PN_CHARS_U | [0-9] 
         return isPNCharsBase(ch) || ( ch == '_' ) || isDigit(ch) ;
     }
     
     public static boolean isPNChars(int ch)
     {
-        // #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
-        return isPNChars_U(ch) || ( ch == '-' ) || ch == 0x00B7 || r(ch, 0x306, 0x036F) || r(ch, 0x203F, 0x2040) ;
+        // PN_CHARS ::=  PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
+        return isPNChars_U(ch) || isDigit(ch) || ( ch == '-' ) || ch == 0x00B7 || r(ch, 0x300, 0x036F) || r(ch, 0x203F, 0x2040) ;
     }
-    
-    
     
     public static int valHexChar(int ch)
     {

Modified: incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/tokens/TokenizerText.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/tokens/TokenizerText.java?rev=1179358&r1=1179357&r2=1179358&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/tokens/TokenizerText.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/main/java/org/openjena/riot/tokens/TokenizerText.java Wed Oct  5 18:16:44 2011
@@ -18,17 +18,47 @@
 
 package org.openjena.riot.tokens;
 
-import static org.openjena.atlas.lib.Chars.* ;
-import static org.openjena.riot.system.RiotChars.* ;
-
-import java.util.NoSuchElementException ;
-
-import org.openjena.atlas.AtlasException ;
-import org.openjena.atlas.io.IO ;
-import org.openjena.atlas.io.PeekReader ;
-import org.openjena.atlas.lib.Chars ;
-import org.openjena.riot.RiotParseException ;
-import org.openjena.riot.system.RiotChars ;
+import static org.openjena.atlas.lib.Chars.CH_AT ;
+import static org.openjena.atlas.lib.Chars.CH_COLON ;
+import static org.openjena.atlas.lib.Chars.CH_COMMA ;
+import static org.openjena.atlas.lib.Chars.CH_DOT ;
+import static org.openjena.atlas.lib.Chars.CH_EQUALS ;
+import static org.openjena.atlas.lib.Chars.CH_GT ;
+import static org.openjena.atlas.lib.Chars.CH_HASH ;
+import static org.openjena.atlas.lib.Chars.CH_LBRACE ;
+import static org.openjena.atlas.lib.Chars.CH_LBRACKET ;
+import static org.openjena.atlas.lib.Chars.CH_LPAREN ;
+import static org.openjena.atlas.lib.Chars.CH_LT ;
+import static org.openjena.atlas.lib.Chars.CH_MINUS ;
+import static org.openjena.atlas.lib.Chars.CH_PLUS ;
+import static org.openjena.atlas.lib.Chars.CH_QMARK ;
+import static org.openjena.atlas.lib.Chars.CH_QUOTE1 ;
+import static org.openjena.atlas.lib.Chars.CH_QUOTE2 ;
+import static org.openjena.atlas.lib.Chars.CH_RBRACE ;
+import static org.openjena.atlas.lib.Chars.CH_RBRACKET ;
+import static org.openjena.atlas.lib.Chars.CH_RPAREN ;
+import static org.openjena.atlas.lib.Chars.CH_SEMICOLON ;
+import static org.openjena.atlas.lib.Chars.CH_STAR ;
+import static org.openjena.atlas.lib.Chars.CH_UNDERSCORE ;
+import static org.openjena.atlas.lib.Chars.CR ;
+import static org.openjena.atlas.lib.Chars.EOF ;
+import static org.openjena.atlas.lib.Chars.NL ;
+import static org.openjena.riot.system.RiotChars.charInArray ;
+import static org.openjena.riot.system.RiotChars.isA2Z ;
+import static org.openjena.riot.system.RiotChars.isA2ZN ;
+import static org.openjena.riot.system.RiotChars.isAlphaNumeric ;
+import static org.openjena.riot.system.RiotChars.isNewlineChar ;
+import static org.openjena.riot.system.RiotChars.isWhitespace ;
+import static org.openjena.riot.system.RiotChars.range ;
+import static org.openjena.riot.system.RiotChars.valHexChar ;
+
+import java.util.NoSuchElementException ;
+
+import org.openjena.atlas.AtlasException ;
+import org.openjena.atlas.io.IO ;
+import org.openjena.atlas.io.PeekReader ;
+import org.openjena.riot.RiotParseException ;
+import org.openjena.riot.system.RiotChars ;
 
 /** Tokenizer for all sorts of things RDF-ish */
 
@@ -497,8 +527,13 @@ public final class TokenizerText impleme
     */
         
     
+    private String readPrefixPart()
+    //{ return readWordSub(false, false) ; }
+    { return readSegment(false) ; }
+    
     private String readLocalPart()
-    { return readWordSub(true, false) ; }
+    //{ return readWordSub(true, false) ; }
+    { return readSegment(true) ; }
 
     private String readSegment(boolean isLocalPart)
     { 
@@ -507,7 +542,7 @@ public final class TokenizerText impleme
         // RiotChars has isPNChars_U_N for   ( PN_CHARS_U | [0-9] )
         stringBuilder.setLength(0) ;
         
-        // First character
+        // -- Test first character
         int ch = reader.peekChar() ;
         if ( ch == EOF )
             return "" ;
@@ -519,35 +554,47 @@ public final class TokenizerText impleme
         {
             if ( ! RiotChars.isPNCharsBase(ch) ) return "" ;
         }
-        
+        // First character is acceptable: it is buffered and consumed just below.
+        // -- Do remainder
         stringBuilder.append((char)ch) ;
         reader.readChar() ;
+        int chDot = 0 ;
         
-        boolean canBeLast = true ;
         for (;;)
         {
-            // Put previous chacarer in buffer
-            stringBuilder.append((char)ch) ;
-            reader.readChar() ;
-
             ch = reader.peekChar() ;
-            if ( ! RiotChars.isPNChars(ch) && ch == Chars.CH_DOT )
+            if ( RiotChars.isPNChars(ch) )
+            {
+                reader.readChar() ;
+                // Was there also a DOT?
+                if ( chDot != 0 )
+                {
+                    stringBuilder.append((char)chDot) ;
+                    chDot = 0 ;
+                }
+                stringBuilder.append((char)ch) ;
+                continue ;
+            }
+            // Not isPNChars
+            if ( ch != CH_DOT )
                 break ;
-        }
-        // End condition.
-        if ( ch != Chars.CH_DOT )
-        {
-            stringBuilder.append((char)ch) ;
+            // DOT
             reader.readChar() ;
+            chDot = ch ;
         }
+        // On exit, chDot may hold a character.
+
+        if ( chDot == CH_DOT )
+            // Unread it.
+            reader.pushbackChar(chDot) ;
+        
+        //stringBuilder.deleteCharAt(chDot)
+        
         return stringBuilder.toString() ;
     }
 
 
     
-    private String readPrefixPart()
-    { return readWordSub(false, false) ; }
-
     // Get characters between two markers.
     // strEscapes may be processed
     // endNL end of line as an ending is OK

Modified: incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/riot/tokens/TestTokenizer.java
URL: http://svn.apache.org/viewvc/incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/riot/tokens/TestTokenizer.java?rev=1179358&r1=1179357&r2=1179358&view=diff
==============================================================================
--- incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/riot/tokens/TestTokenizer.java (original)
+++ incubator/jena/Jena2/ARQ/trunk/src/test/java/org/openjena/riot/tokens/TestTokenizer.java Wed Oct  5 18:16:44 2011
@@ -435,6 +435,47 @@ public class TestTokenizer extends BaseT
         assertEquals("-456", token.getImage()) ;
     }
 
+    
+    @Test
+    public void tokenUnit_pname10()
+    {
+        tokenizeAndTestExact("a:a.b", TokenType.PREFIXED_NAME, "a", "a.b") ;
+    }
+    
+    @Test
+    public void tokenUnit_pname11()
+    {
+        tokenizeAndTestExact("a:0.b", TokenType.PREFIXED_NAME, "a", "0.b") ;
+    }
+    
+    @Test
+    public void tokenUnit_pname12()
+    {
+        tokenizeAndTestFirst("a:0. b", TokenType.PREFIXED_NAME, "a", "0") ;
+    }
+
+    @Test
+    public void tokenUnit_pname13()
+    {
+        // x00e9 é
+        // x0065 e and x0301 ́
+        tokenizeAndTestExact("a:xyzé", TokenType.PREFIXED_NAME, "a", "xyz\u00e9") ;
+    }
+
+    @Test
+    public void tokenUnit_pname14()
+    {
+        // x0065 e and x0301 ́  
+        tokenizeAndTestExact("a:xyze\u0301", TokenType.PREFIXED_NAME, "a", "xyze\u0301") ;
+    }
+
+    @Test
+    public void tokenUnit_pname15()
+    {
+        // x0065 e and x0301 ́  
+        tokenizeAndTestExact("a:xe\u0301y", TokenType.PREFIXED_NAME, "a", "xe\u0301y") ;
+    }
+    
 //    @Test
 //    public void tokenUnit_pname10()
 //    {
@@ -693,6 +734,12 @@ public class TestTokenizer extends BaseT
     }
 
     @Test
+    public void directive_1() { tokenizeAndTestExact("@prefix", TokenType.DIRECTIVE, "prefix") ; }
+    
+    @Test
+    public void directive_2() { tokenizeAndTestExact("@base", TokenType.DIRECTIVE, "base") ; }
+
+    @Test
     public void tokenComment_01()
     {
         tokenizeAndTestExact("_:123 # Comment", TokenType.BNODE, "123") ;