You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2012/12/28 23:09:26 UTC
svn commit: r1426647 -
/jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
Author: andy
Date: Fri Dec 28 22:09:26 2012
New Revision: 1426647
URL: http://svn.apache.org/viewvc?rev=1426647&view=rev
Log:
Add code for very lax parsing of URIs (currently switched off).
Such URIs may well cause problems in other parts of Jena.
Modified:
jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
Modified: jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java?rev=1426647&r1=1426646&r2=1426647&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java Fri Dec 28 22:09:26 2012
@@ -432,9 +432,6 @@ public final class TokenizerText impleme
// Can't start with a number due to numeric test above.
// Can't start with a '_' due to blank node test above.
// If we see a :, the first time it means a prefixed name else it's a token break.
-
- // (eventually) Make this a very wide definition, including symbols like <=
- // Prefixed names are the difficulty.
readPrefixedNameOrKeyword(token) ;
@@ -442,6 +439,8 @@ public final class TokenizerText impleme
return token ;
}
+ private static final boolean VeryVeryLax = false ; // Switch for very lax IRI parsing (off): when true, IRI escapes go through readCharEscapeAnyURI() and the '<' bad-character check is skipped.
+
//static char[] badIRIchars = { '<', '>', '{', '}', '|', '\\', '`', '^', ' ' } ;
private String readIRI()
{
@@ -471,16 +470,38 @@ public final class TokenizerText impleme
{
// N-Triples strictly allows \t\n etc in IRIs (grammar says "string escapes")
// ch = strEscapes ? readLiteralEscape() : readUnicodeEscape() ;
- ch = readUnicodeEscape() ;
+
+ if ( VeryVeryLax )
+ ch = readCharEscapeAnyURI() ;
+ else
+ // NORMAL
+ ch = readUnicodeEscape() ;
// Drop through.
}
// Ban certain very bad characters
- if ( ch == '<' )
+ if ( !VeryVeryLax && ch == '<' )
exception("Broken IRI (bad character: '"+(char)ch+"'): "+stringBuilder.toString(), y, x) ;
insertCodepoint(stringBuilder, ch) ;
}
}
+ private final // Lax escape reader: returns the code point that an escape inside an IRI stands for.
+ int readCharEscapeAnyURI() // Called from the IRI-reading code in place of readUnicodeEscape() when VeryVeryLax is true.
+ {
+ int c = reader.readChar(); // The character selecting the escape kind (presumably the one right after an already-consumed '\' - confirm against caller).
+ if ( c==EOF ) // Input ended in the middle of an escape.
+ exception("Escape sequence not completed") ; // NOTE(review): exception(...) presumably throws, so the switch below is not reached with EOF - confirm.
+
+ switch (c)
+ {
+ case 'u': return readUnicode4Escape(); // \\u form; by its name the helper reads 4 hex digits.
+ case 'U': return readUnicode8Escape(); // \\U form; by its name the helper reads 8 hex digits.
+ default:
+ // Anything else: \X stands for the character X itself, with no validation.
+ return c ;
+ }
+ }
+
private void readPrefixedNameOrKeyword(Token token)
{
long posn = reader.getPosition() ;