You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2012/12/28 23:09:26 UTC
svn commit: r1426647 - /jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java

Author: andy
Date: Fri Dec 28 22:09:26 2012
New Revision: 1426647

URL: http://svn.apache.org/viewvc?rev=1426647&view=rev
Log:
Add code for very lax parsing of URIs (currently switched off).
Such URIs may well cause problems in other parts of Jena.

Modified:
    jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java

Modified: jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java?rev=1426647&r1=1426646&r2=1426647&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java Fri Dec 28 22:09:26 2012
@@ -432,9 +432,6 @@ public final class TokenizerText impleme
         //   Can't start with a number due to numeric test above.
         //   Can't start with a '_' due to blank node test above.
         // If we see a :, the first time it means a prefixed name else it's a token break.
-
-        // (eventually) Make this a very wide definition, including symbols like <= 
-        // Prefixed names are the difficulty.
         
         readPrefixedNameOrKeyword(token) ;
         
@@ -442,6 +439,8 @@ public final class TokenizerText impleme
         return token ;
     }
 
+    private static final boolean VeryVeryLax = false ; // Off by default. When true, the IRI reader uses readCharEscapeAnyURI() for escapes and no longer rejects '<'.
+    
     //static char[] badIRIchars = { '<', '>', '{', '}', '|', '\\', '`', '^', ' ' } ; 
     private String readIRI()
     {
@@ -471,16 +470,38 @@ public final class TokenizerText impleme
             {
                 // N-Triples strictly allows \t\n etc in IRIs (grammar says "string escapes")
                 // ch = strEscapes ? readLiteralEscape() : readUnicodeEscape() ;
-                ch = readUnicodeEscape() ;
+                
+                if ( VeryVeryLax )
+                    ch = readCharEscapeAnyURI() ;
+                else
+                    // NORMAL 
+                    ch = readUnicodeEscape() ;
                 // Drop through.
             }
             // Ban certain very bad characters
-            if ( ch == '<' )
+            if ( !VeryVeryLax && ch == '<' )
                 exception("Broken IRI (bad character: '"+(char)ch+"'): "+stringBuilder.toString(), y, x) ;
             insertCodepoint(stringBuilder, ch) ;
         }
     }
     
+    private final // Lax escape reader: in this change it is called from the IRI-reading loop only when VeryVeryLax is true.
+    int readCharEscapeAnyURI()
+    {
+        int c = reader.readChar(); // Character after the escape introducer (presumably '\', already consumed by the caller - confirm).
+        if ( c==EOF )
+            exception("Escape sequence not completed") ; // NOTE(review): presumably throws; if it returns, EOF reaches the default case below and is returned.
+
+        switch (c)
+        {
+            case 'u': return readUnicode4Escape(); // \u form: delegates to readUnicode4Escape() (presumably 4 hex digits - confirm).
+            case 'U': return readUnicode8Escape(); // \U form: delegates to readUnicode8Escape() (presumably 8 hex digits - confirm).
+            default:
+                // Anything \X : any other escaped character is returned unchanged, with no validation.
+                return c ;
+        }
+    }
+    
     private void readPrefixedNameOrKeyword(Token token)
     {
         long posn = reader.getPosition() ;