You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2020/06/28 09:23:03 UTC
[jena] branch master updated: JENA-1924: Test for ucschar in
tokenizer.
This is an automated email from the ASF dual-hosted git repository.
andy pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/jena.git
The following commit(s) were added to refs/heads/master by this push:
new f24f970 JENA-1924: Test for ucschar in tokenizer.
new 0f9cd55 Merge pull request #768 from afs/ucschar
f24f970 is described below
commit f24f9709861fac52c58ebb4fc45126db0e50d2c9
Author: Andy Seaborne <an...@apache.org>
AuthorDate: Thu Jun 25 18:56:16 2020 +0100
JENA-1924: Test for ucschar in tokenizer.
---
.../jena/atlas/json/io/parser/TokenizerJSON.java | 2 +-
.../main/java/org/apache/jena/riot/RDFParser.java | 10 +-
.../org/apache/jena/riot/lang/RiotParsers.java | 9 +-
.../jena/riot/system/ErrorHandlerFactory.java | 35 +++-
.../org/apache/jena/riot/system/RiotChars.java | 3 +-
.../jena/riot/system/stream/LocationMapper.java | 6 +
.../jena/riot/system/stream/StreamManager.java | 5 +
.../jena/riot/tokens/ErrorHandlerTokenizer.java | 38 ++++
.../java/org/apache/jena/riot/tokens/Token.java | 7 +-
.../jena/riot/tokens/TokenizeTextBuilder.java | 126 +++++++++++++
.../apache/jena/riot/tokens/TokenizerFactory.java | 63 ++++---
.../org/apache/jena/riot/tokens/TokenizerText.java | 196 ++++++++++++---------
.../jena/riot/lang/AbstractTestLangNTuples.java | 2 +-
.../org/apache/jena/riot/lang/TestLangTrig.java | 8 +-
.../org/apache/jena/riot/lang/TestLangTurtle.java | 7 +-
.../org/apache/jena/riot/tokens/TestTokenizer.java | 12 +-
jena-arq/testing/RIOT/Lang/Changes | 15 ++
jena-arq/testing/RIOT/Lang/TrigStd/manifest.ttl | 5 +-
jena-arq/testing/RIOT/Lang/TurtleStd/manifest.ttl | 5 +-
.../java/org/apache/jena/util/FileManagerImpl.java | 5 +-
.../main/java/org/apache/jena/dboe/sys/Sys.java | 6 +-
.../java/org/apache/jena/tdb2/sys/SystemTDB.java | 36 +---
.../test/java/org/apache/jena/tdb2/ConfigTest.java | 4 +-
.../java/org/apache/jena/fuseki/TestAdminAPI.java | 3 +-
.../main/java/org/apache/jena/iri/impl/Parser.java | 24 ++-
25 files changed, 431 insertions(+), 201 deletions(-)
diff --git a/jena-arq/src/main/java/org/apache/jena/atlas/json/io/parser/TokenizerJSON.java b/jena-arq/src/main/java/org/apache/jena/atlas/json/io/parser/TokenizerJSON.java
index 13eea4c..780822b 100644
--- a/jena-arq/src/main/java/org/apache/jena/atlas/json/io/parser/TokenizerJSON.java
+++ b/jena-arq/src/main/java/org/apache/jena/atlas/json/io/parser/TokenizerJSON.java
@@ -105,7 +105,7 @@ public class TokenizerJSON implements Tokenizer
public void remove()
{ throw new UnsupportedOperationException() ; }
- // ---- Machinary
+ // ---- Machinery
// ""-string, ''-string, *X,
// various single characters . , : ;
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/RDFParser.java b/jena-arq/src/main/java/org/apache/jena/riot/RDFParser.java
index e9318de..5794c65 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/RDFParser.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/RDFParser.java
@@ -375,11 +375,10 @@ public class RDFParser {
}
TypedInputStream in;
+ // Need more control than LocatorURL provides to get the Accept header in and the HttpClient.
+ // So map now.
urlStr = streamManager.mapURI(urlStr);
if ( urlStr.startsWith("http://") || urlStr.startsWith("https://") ) {
- // Need more control than LocatorURL provides. We could use it for the
- // httpClient == null case.
- //
// HttpOp.execHttpGet(,acceptHeader,) overrides the HttpClient default setting.
//
// If there is an explicitly set HttpClient use that as given, and do not override
@@ -388,8 +387,9 @@ public class RDFParser {
String acceptHeader =
( httpClient == null ) ? WebContent.defaultRDFAcceptHeader : null;
in = HttpOp.execHttpGet(urlStr, acceptHeader, httpClient, null);
- } else {
- in = streamManager.open(urlStr);
+ } else {
+ // Already mapped.
+ in = streamManager.openNoMapOrNull(urlStr);
}
if ( in == null )
throw new RiotNotFoundException("Not found: "+urlStr);
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/lang/RiotParsers.java b/jena-arq/src/main/java/org/apache/jena/riot/lang/RiotParsers.java
index 8e8df94..56d4035 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/lang/RiotParsers.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/lang/RiotParsers.java
@@ -53,8 +53,7 @@ public class RiotParsers {
Tokenizer tokenizer = new TokenizerJSON(PeekReader.makeUTF8(input));
return createParserRdfJson(tokenizer, dest, profile);
}
-
- Tokenizer tokenizer = TokenizerFactory.makeTokenizerUTF8(input);
+ Tokenizer tokenizer = TokenizerFactory.makeTokenizerUTF8(input, profile.getErrorHandler());
if ( RDFLanguages.sameLang(TURTLE, lang) || RDFLanguages.sameLang(N3, lang) )
return createParserTurtle(tokenizer, dest, profile);
if ( RDFLanguages.sameLang(NTRIPLES, lang) )
@@ -74,7 +73,7 @@ public class RiotParsers {
}
@SuppressWarnings("deprecation")
- Tokenizer tokenizer = TokenizerFactory.makeTokenizer(input);
+ Tokenizer tokenizer = TokenizerFactory.makeTokenizer(input, profile.getErrorHandler());
if ( RDFLanguages.sameLang(TURTLE, lang) || RDFLanguages.sameLang(N3, lang) )
return createParserTurtle(tokenizer, dest, profile);
if ( RDFLanguages.sameLang(NTRIPLES, lang) )
@@ -121,7 +120,7 @@ public class RiotParsers {
/** Create an iterator for parsing N-Triples. */
public static Iterator<Triple> createIteratorNTriples(InputStream input, StreamRDF dest, ParserProfile profile) {
// LangNTriples supports iterator use.
- Tokenizer tokenizer = TokenizerFactory.makeTokenizerUTF8(input);
+ Tokenizer tokenizer = TokenizerFactory.makeTokenizerUTF8(input, profile.getErrorHandler());
return createParserNTriples(tokenizer, null, profile);
}
@@ -133,7 +132,7 @@ public class RiotParsers {
/** Create an iterator for parsing N-Quads. */
public static Iterator<Quad> createIteratorNQuads(InputStream input, StreamRDF dest, ParserProfile profile) {
// LangNQuads supports iterator use.
- Tokenizer tokenizer = TokenizerFactory.makeTokenizerUTF8(input);
+ Tokenizer tokenizer = TokenizerFactory.makeTokenizerUTF8(input, profile.getErrorHandler());
return createParserNQuads(tokenizer, null, profile);
}
}
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/ErrorHandlerFactory.java b/jena-arq/src/main/java/org/apache/jena/riot/system/ErrorHandlerFactory.java
index 8c9bd3e..e5c1633 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/ErrorHandlerFactory.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ErrorHandlerFactory.java
@@ -72,8 +72,13 @@ public class ErrorHandlerFactory
* An error handler that throws a {@link RiotParseException}, hence it
* exposes the details of errors.
*/
- public static ErrorHandler errorHandlerDetailed() { return new ErrorHandlerRiotParseException() ; }
+ public static ErrorHandler errorHandlerDetailed() { return new ErrorHandlerRiotParseErrors() ; }
+ /**
+ * An error handler that throws exceptions in all cases.
+ */
+ public static ErrorHandler errorHandlerExceptions() { return new ErrorHandlerRiotParseException() ; }
+
private static ErrorHandler defaultErrorHandler = errorHandlerStd ;
/** Get the current default error handler */
public static ErrorHandler getDefaultErrorHandler() { return defaultErrorHandler ; }
@@ -130,8 +135,9 @@ public class ErrorHandlerFactory
/** report a warning */
@Override
- public void warning(String message, long line, long col)
- { logWarning(message, line, col) ; }
+ public void warning(String message, long line, long col) {
+ logWarning(message, line, col);
+ }
/** report an error */
@Override
@@ -304,8 +310,10 @@ public class ErrorHandlerFactory
}
/** An error handler that throws a RiotParseException, hence it exposes the details of errors. */
- private static class ErrorHandlerRiotParseException implements ErrorHandler {
- public ErrorHandlerRiotParseException() {}
+ private static class ErrorHandlerRiotParseErrors implements ErrorHandler {
+
+ public ErrorHandlerRiotParseErrors() {}
+
@Override public void warning(String message, long line, long col) { }
@Override public void error(String message, long line, long col) {
@@ -316,5 +324,22 @@ public class ErrorHandlerFactory
throw new RiotParseException(message, line, col);
}
}
+
+ /** An error handler that throws a RiotParseException in all cases. */
+ private static class ErrorHandlerRiotParseException implements ErrorHandler {
+
+ public ErrorHandlerRiotParseException() {}
+
+ @Override public void warning(String message, long line, long col) {
+ throw new RiotParseException(message, line, col);
+ }
+ @Override public void error(String message, long line, long col) {
+ throw new RiotParseException(message, line, col);
+ }
+
+ @Override public void fatal(String message, long line, long col) {
+ throw new RiotParseException(message, line, col);
+ }
+ }
}
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/RiotChars.java b/jena-arq/src/main/java/org/apache/jena/riot/system/RiotChars.java
index 4255384..449c52d 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/RiotChars.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/RiotChars.java
@@ -141,7 +141,8 @@ Notes: PN_CHARS_BASE has a hole above #xD800 -- these are the surrogate pairs
private static boolean r(int ch, int a, int b) { return ( ch >= a && ch <= b ); }
- public static boolean range(int ch, char a, char b) {
+ /** Test whether a codepoint is in a given range (both ends inclusive). */
+ public static boolean range(int ch, int a, int b) {
return (ch >= a && ch <= b);
}
}
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/stream/LocationMapper.java b/jena-arq/src/main/java/org/apache/jena/riot/system/stream/LocationMapper.java
index 1412e91..70817e9 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/stream/LocationMapper.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/stream/LocationMapper.java
@@ -68,6 +68,10 @@ public class LocationMapper
this.altPrefixes.putAll(lmap2.altPrefixes) ;
}
+ public boolean containsMapping(String uri) {
+ return altMapping(uri, null) != null;
+ }
+
public String altMapping(String uri) {
return altMapping(uri, uri) ;
}
@@ -82,6 +86,8 @@ public class LocationMapper
* @return The alternative location chosen
*/
public String altMapping(String uri, String otherwise) {
+ if ( altLocations.isEmpty() && altPrefixes.isEmpty() )
+ return otherwise;
if ( altLocations.containsKey(uri) )
return altLocations.get(uri) ;
String newStart = null ;
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/stream/StreamManager.java b/jena-arq/src/main/java/org/apache/jena/riot/system/stream/StreamManager.java
index 2553a5c..ff6aa4e 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/system/stream/StreamManager.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/system/stream/StreamManager.java
@@ -132,6 +132,11 @@ public class StreamManager {
return openNoMapOrNull(uri) ;
}
+ /** Test whether a mapping exists */
+ public boolean hasMapping(String filenameOrURI) {
+ return mapper.containsMapping(filenameOrURI);
+ }
+
/** Apply the mapping of a filename or URI */
public String mapURI(String filenameOrURI) {
if ( mapper == null )
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/ErrorHandlerTokenizer.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/ErrorHandlerTokenizer.java
new file mode 100644
index 0000000..e48561a
--- /dev/null
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/ErrorHandlerTokenizer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.riot.tokens;
+
+import org.apache.jena.riot.RiotParseException;
+import org.apache.jena.riot.system.ErrorHandler;
+
+public class ErrorHandlerTokenizer implements ErrorHandler {
+ @Override public void warning(String message, long line, long col) {
+ // Warning/continue.
+ //ErrorHandlerFactory.errorHandlerStd.warning(message, line, col);
+ throw new RiotParseException(message, line, col);
+ }
+
+ @Override public void error(String message, long line, long col) {
+ throw new RiotParseException(message, line, col);
+ }
+
+ @Override public void fatal(String message, long line, long col) {
+ throw new RiotParseException(message, line, col);
+ }
+}
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/Token.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/Token.java
index d01cc9d..bd11ce0 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/Token.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/Token.java
@@ -26,7 +26,6 @@ import java.util.ArrayList ;
import java.util.List ;
import java.util.Objects ;
-import org.apache.jena.atlas.io.PeekReader ;
import org.apache.jena.atlas.iterator.Iter ;
import org.apache.jena.atlas.lib.Pair ;
import org.apache.jena.datatypes.RDFDatatype ;
@@ -105,8 +104,7 @@ public final class Token
static Token create(String s)
{
- PeekReader pr = PeekReader.readString(s) ;
- TokenizerText tt = new TokenizerText(pr) ;
+ Tokenizer tt = TokenizerText.create().fromString(s).build();
if ( ! tt.hasNext() )
throw new RiotException("No token") ;
Token t = tt.next() ;
@@ -117,8 +115,7 @@ public final class Token
static Iter<Token> createN(String s)
{
- PeekReader pr = PeekReader.readString(s) ;
- TokenizerText tt = new TokenizerText(pr) ;
+ Tokenizer tt = TokenizerText.create().fromString(s).build();
List<Token> x = new ArrayList<>() ;
while(tt.hasNext())
x.add(tt.next()) ;
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizeTextBuilder.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizeTextBuilder.java
new file mode 100644
index 0000000..985294d
--- /dev/null
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizeTextBuilder.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.riot.tokens;
+
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.jena.atlas.io.PeekReader;
+import org.apache.jena.atlas.lib.InternalErrorException;
+import org.apache.jena.riot.system.ErrorHandler;
+import org.apache.jena.riot.system.ErrorHandlerFactory;
+
+/** Builder for {@link TokenizerText}. */
+public class TokenizeTextBuilder {
+
+ // One of these.
+ private PeekReader peekReader = null;
+ private InputStream input = null;
+ private Reader reader = null;
+ private String string = null;
+
+ private boolean lineMode = false;
+ private boolean utf8 = true;
+ private ErrorHandler errorHandler = null;
+
+ TokenizeTextBuilder() {}
+
+ private void clearInput() {
+ this.peekReader = null;
+ this.input = null;
+ this.reader = null;
+ this.string = null;
+ }
+
+ public TokenizeTextBuilder source(InputStream input) {
+ clearInput();
+ this.input = input;
+ return this;
+ }
+
+ public TokenizeTextBuilder source(Reader reader) {
+ clearInput();
+ this.reader = reader;
+ return this;
+ }
+
+ public TokenizeTextBuilder source(PeekReader peekReader) {
+ clearInput();
+ this.peekReader = peekReader;
+ return this;
+ }
+
+ public TokenizeTextBuilder fromString(String string) {
+ clearInput();
+ this.string = string;
+ return this;
+ }
+
+ public TokenizeTextBuilder lineMode(boolean lineMode) {
+ this.lineMode = lineMode;
+ return this;
+ }
+
+ public TokenizeTextBuilder asciiOnly(boolean asciiOnly) {
+ this.utf8 = !asciiOnly;
+ return this;
+ }
+
+ public TokenizeTextBuilder errorHandler(ErrorHandler errorHandler) {
+ this.errorHandler = errorHandler;
+ return this;
+ }
+
+ private static int countNulls(Object ... objs) {
+ int x = 0;
+ for ( Object obj : objs )
+ if ( obj == null )
+ x++;
+ return x;
+ }
+
+ private static int countNotNulls(Object ... objs) {
+ int x = 0;
+ for ( Object obj : objs )
+ if ( obj != null )
+ x++;
+ return x;
+ }
+
+ public Tokenizer build() {
+ ErrorHandler errHandler = (errorHandler != null) ? errorHandler : ErrorHandlerFactory.errorHandlerExceptions();
+ int x = countNotNulls(peekReader, input, reader, string);
+ if ( x > 1 )
+ throw new InternalErrorException("Too many data sources");
+ PeekReader pr;
+ if ( input != null ) {
+ pr = utf8 ? PeekReader.makeUTF8(input) : PeekReader.makeASCII(input);
+ } else if ( string != null ) {
+ pr = PeekReader.readString(string);
+ } else if ( reader != null ) {
+ pr = PeekReader.make(reader);
+ } else if ( peekReader != null ) {
+ pr = peekReader;
+ } else {
+ throw new IllegalStateException("No data source");
+ }
+
+ return TokenizerText.internal(pr, lineMode, errHandler);
+ }
+}
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
index 222eb5b..ef2566ab 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerFactory.java
@@ -16,55 +16,62 @@
* limitations under the License.
*/
-package org.apache.jena.riot.tokens ;
+package org.apache.jena.riot.tokens;
-import java.io.ByteArrayInputStream ;
-import java.io.InputStream ;
-import java.io.Reader ;
-import java.io.StringReader ;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
-import org.apache.jena.atlas.io.PeekReader ;
-import org.apache.jena.atlas.lib.StrUtils ;
+import org.apache.jena.riot.system.ErrorHandler;
public class TokenizerFactory {
-
+
+ private static ErrorHandler dftErrorHandler = null;
+
/** Discouraged - be careful about character sets */
@Deprecated
public static Tokenizer makeTokenizer(Reader reader) {
- PeekReader peekReader = PeekReader.make(reader) ;
- Tokenizer tokenizer = new TokenizerText(peekReader) ;
- return tokenizer ;
+ return TokenizerText.create().source(reader).build();
+ }
+
+ /** Discouraged - be careful about character sets */
+ @Deprecated
+ public static Tokenizer makeTokenizer(Reader reader, ErrorHandler errorHandler) {
+ return TokenizerText.create().source(reader).errorHandler(errorHandler).build();
}
/** Safe use of a StringReader */
public static Tokenizer makeTokenizer(StringReader reader) {
- PeekReader peekReader = PeekReader.make(reader) ;
- Tokenizer tokenizer = new TokenizerText(peekReader) ;
- return tokenizer ;
+ return TokenizerText.create().source(reader).build();
+ }
+
+ /** Safe use of a StringReader */
+ public static Tokenizer makeTokenizer(StringReader reader, ErrorHandler errorHandler) {
+ return TokenizerText.create().source(reader).errorHandler(errorHandler).build();
}
public static Tokenizer makeTokenizerUTF8(InputStream in) {
+ return makeTokenizerUTF8(in, dftErrorHandler);
+ }
+
+ public static Tokenizer makeTokenizerUTF8(InputStream input, ErrorHandler errorHandler) {
// BOM will be removed
- PeekReader peekReader = PeekReader.makeUTF8(in) ;
- Tokenizer tokenizer = new TokenizerText(peekReader) ;
- return tokenizer ;
+ return TokenizerText.create().source(input).errorHandler(errorHandler).build();
}
- public static Tokenizer makeTokenizerASCII(InputStream in) {
- PeekReader peekReader = PeekReader.makeASCII(in) ;
- Tokenizer tokenizer = new TokenizerText(peekReader) ;
- return tokenizer ;
+ public static Tokenizer makeTokenizerASCII(InputStream input) {
+ return TokenizerText.create().source(input).asciiOnly(true).build();
}
- public static Tokenizer makeTokenizerASCII(String string) {
- byte b[] = StrUtils.asUTF8bytes(string) ;
- ByteArrayInputStream in = new ByteArrayInputStream(b) ;
- return makeTokenizerASCII(in) ;
+ public static Tokenizer makeTokenizerASCII(InputStream input, ErrorHandler errorHandler) {
+ return TokenizerText.create().source(input).asciiOnly(true).errorHandler(errorHandler).build();
}
public static Tokenizer makeTokenizerString(String str) {
- PeekReader peekReader = PeekReader.readString(str) ;
- Tokenizer tokenizer = new TokenizerText(peekReader) ;
- return tokenizer ;
+ return TokenizerText.create().fromString(str).build();
+ }
+
+ public static Tokenizer makeTokenizerString(String str, ErrorHandler errorHandler) {
+ return TokenizerText.create().fromString(str).errorHandler(errorHandler).build();
}
}
diff --git a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
index 5215edd..a862978 100644
--- a/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
+++ b/jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java
@@ -22,6 +22,7 @@ import static org.apache.jena.atlas.lib.Chars.*;
import static org.apache.jena.riot.system.RiotChars.*;
import java.util.NoSuchElementException;
+import java.util.Objects;
import org.apache.jena.atlas.AtlasException;
import org.apache.jena.atlas.io.IO;
@@ -33,53 +34,40 @@ import org.apache.jena.riot.system.RiotChars;
import org.apache.jena.sparql.ARQInternalErrorException;
/** Tokenizer for all sorts of things RDF-ish */
-
public final class TokenizerText implements Tokenizer
{
// Drop through to final general symbol/keyword reader, including <=, !=
// Care with <=
// Policy driven for CURIES?
- // Various allow/deny options (via checker?)
-
- // Space for CURIEs, stricter Turtle QNames, sane Turtle (i.e. leading digits in local part).
public static final int CTRL_CHAR = CH_STAR;
- public static boolean Checking = false;
+ // The code has the call points for checking tokens but it is generally better to
+ // do the check later in the parsing process. In case a need arises, the code
+ // remains, all compiled away by "if ( false )" (javac does not generate any
+ // bytecodes and even if it did, the JIT will remove dead branches).
+ private static final boolean Checking = false;
private Token token = null;
private final StringBuilder stringBuilder = new StringBuilder(200);
private final PeekReader reader;
- private final boolean lineMode; // Whether whitespace includes or excludes NL (in its various forms).
+ // Whether whitespace between tokens includes newlines (in various forms).
+ private final boolean lineMode;
private boolean finished = false;
private TokenChecker checker = null;
- private static class ErrorHandlerTokenizer implements ErrorHandler {
- @Override public void warning(String message, long line, long col) {
- // Warning/continue.
- //ErrorHandlerFactory.errorHandlerStd.warning(message, line, col);
- throw new RiotParseException(message, line, col);
- }
-
- @Override public void error(String message, long line, long col) {
- throw new RiotParseException(message, line, col);
- }
-
- @Override public void fatal(String message, long line, long col) {
- throw new RiotParseException(message, line, col);
- }
- };
// The code assumes that errors throw exception and so stop parsing.
- private static final ErrorHandler defaultErrorHandler = new ErrorHandlerTokenizer();
- private ErrorHandler errorHandler = defaultErrorHandler;
+ private final ErrorHandler errorHandler;
- /*package*/ TokenizerText(PeekReader reader) {
- this(reader, false);
+ public static TokenizeTextBuilder create() { return new TokenizeTextBuilder() ; }
+
+ /*package*/ static TokenizerText internal(PeekReader reader, boolean lineMode, ErrorHandler errorHandler) {
+ return new TokenizerText(reader, lineMode, errorHandler);
}
-
- /*package*/ TokenizerText(PeekReader reader, boolean lineMode) {
- this.reader = reader;
+ private TokenizerText(PeekReader reader, boolean lineMode, ErrorHandler errorHandler) {
+ this.reader = Objects.requireNonNull(reader, "PeekReader");
this.lineMode = lineMode;
+ this.errorHandler = Objects.requireNonNull(errorHandler, "ErrorHandler");
}
@Override
@@ -114,7 +102,6 @@ public final class TokenizerText implements Tokenizer
}
}
-
@Override
public final boolean eof() {
return !hasNext();
@@ -140,28 +127,28 @@ public final class TokenizerText implements Tokenizer
public void remove()
{ throw new UnsupportedOperationException(); }
- public TokenChecker getChecker() {
- return checker;
- }
-
- public void setChecker(TokenChecker checker) {
- this.checker = checker;
- }
-
- public ErrorHandler getErrorHandler() {
- return errorHandler;
- }
-
- public void setErrorHandler(ErrorHandler handler) {
- this.errorHandler = handler;
- }
+// private TokenChecker getChecker() {
+// return checker;
+// }
+//
+// private void setChecker(TokenChecker checker) {
+// this.checker = checker;
+// }
+//
+// private ErrorHandler getErrorHandler() {
+// return errorHandler;
+// }
+//
+// private void setErrorHandler(ErrorHandler handler) {
+// this.errorHandler = handler;
+// }
@Override
public void close() {
IO.close(reader);
}
- // ---- Machinary
+ // ---- Machinery
private void skip() {
int ch = EOF;
@@ -217,7 +204,7 @@ public final class TokenizerText implements Tokenizer
//token.setImage("<<");
return token;
}
- error("Internal error - parsed '"+chPeek+"' after '<'");
+ fatal("Internal error - parsed '"+chPeek+"' after '<'");
}
// ---- Literal
@@ -288,7 +275,7 @@ public final class TokenizerText implements Tokenizer
Token subToken = parseToken();
if ( !subToken.isIRI() )
- error("Datatype URI required after ^^ - URI or prefixed name expected");
+ fatal("Datatype URI required after ^^ - URI or prefixed name expected");
mainToken.setSubToken2(subToken);
mainToken.setType(TokenType.LITERAL_DT);
@@ -385,14 +372,14 @@ public final class TokenizerText implements Tokenizer
case CH_VBAR: reader.readChar(); token.setType(TokenType.VBAR); /*token.setImage(CH_VBAR);*/ return token;
case CH_AMPHERSAND: reader.readChar(); token.setType(TokenType.AMPHERSAND);/*token.setImage(CH_AMPHERSAND);*/ return token;
// Specials (if blank node processing off)
- //case CH_COLON: reader.readChar(); token.setType(TokenType.COLON); return token;
+ //case CH_COLON: reader.readChar(); token.setType(TokenType.COLON); /*token.setImage(COLON);*/return token;
// Done above with blank nodes.
- //case CH_UNDERSCORE: reader.readChar(); token.setType(TokenType.UNDERSCORE); /*token.setImage(CH_UNDERSCORE);*/ return token;
- case CH_LT: reader.readChar(); token.setType(TokenType.LT); /*token.setImage(CH_LT);*/ return token;
- case CH_STAR: reader.readChar(); token.setType(TokenType.STAR); /*token.setImage(CH_STAR);*/ return token;
+ //case CH_UNDERSCORE: reader.readChar(); token.setType(TokenType.UNDERSCORE);/*token.setImage(CH_UNDERSCORE);*/ return token;
+ case CH_LT: reader.readChar(); token.setType(TokenType.LT); /*token.setImage(CH_LT);*/ return token;
+ case CH_STAR: reader.readChar(); token.setType(TokenType.STAR); /*token.setImage(CH_STAR);*/ return token;
- // XXX Multi character symbols
+ // XXX Multi-character symbols
// Two character tokens && || GE >= , LE <=
//TokenType.LE
//TokenType.GE
@@ -483,11 +470,11 @@ public final class TokenizerText implements Tokenizer
int ch = reader.readChar();
switch(ch) {
case EOF:
- error("Broken IRI (End of file)"); return null;
+ fatal("Broken IRI (End of file)"); return null;
case NL:
- error("Broken IRI (newline): %s", stringBuilder.toString()); return null;
+ fatal("Broken IRI (newline): %s", stringBuilder.toString()); return null;
case CR:
- error("Broken IRI (CR): %s", stringBuilder.toString()); return null;
+ fatal("Broken IRI (CR): %s", stringBuilder.toString()); return null;
case CH_GT:
// Done!
return stringBuilder.toString();
@@ -503,8 +490,8 @@ public final class TokenizerText implements Tokenizer
// Bad characters will lead to trouble elsewhere.
break;
case CH_LT:
- // Probably a corrupt file so not a warning.
- error("Bad character in IRI (bad character: '<'): <%s[<]...>", stringBuilder.toString()); return null;
+ // Probably a corrupt file so treat as fatal.
+ fatal("Bad character in IRI (bad character: '<'): <%s[<]...>", stringBuilder.toString()); return null;
case TAB:
error("Bad character in IRI (Tab character): <%s[tab]...>", stringBuilder.toString()); return null;
case '{': case '}': case '"': case '|': case '^': case '`' :
@@ -513,27 +500,55 @@ public final class TokenizerText implements Tokenizer
break;
case SPC:
if ( ! AllowSpacesInIRI )
+ error("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString());
+ else
warning("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString());
break;
default:
if ( ch <= 0x19 )
warning("Illegal character in IRI (control char 0x%02X): <%s[0x%02X]...>", ch, stringBuilder.toString(), ch);
+
}
+ // JENA-1924: jena-iri does not catch this.
+ if ( ! VeryVeryLaxIRI && ch >= 0xA0 && ! isUcsChar(ch) )
+ warning("Illegal character in IRI (Not a ucschar: 0x%04X): <%s[U+%04X]...>", ch, stringBuilder.toString(), ch);
insertCodepoint(stringBuilder, ch);
}
}
+ private static boolean isUcsChar(int ch) {
+ // RFC 3987
+ // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
+ // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
+ // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
+ // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
+ // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
+ // / %xD0000-DFFFD / %xE1000-EFFFD
+ boolean b = range(ch, 0xA0, 0xD7FF) || range(ch, 0xF900, 0xFDCF) || range(ch, 0xFDF0, 0xFFEF);
+ if ( b )
+ return true;
+ if ( ch < 0x1000 )
+ return false;
+ // 32 bit checks.
+ return
+ range(ch, 0x10000, 0x1FFFD) || range(ch, 0x20000, 0x2FFFD) || range(ch, 0x30000, 0x3FFFD) ||
+ range(ch, 0x40000, 0x4FFFD) || range(ch, 0x50000, 0x5FFFD) || range(ch, 0x60000, 0x6FFFD) ||
+ range(ch, 0x70000, 0x7FFFD) || range(ch, 0x80000, 0x8FFFD) || range(ch, 0x90000, 0x9FFFD) ||
+ range(ch, 0xA0000, 0xAFFFD) || range(ch, 0xB0000, 0xBFFFD) || range(ch, 0xC0000, 0xCFFFD) ||
+ range(ch, 0xD0000, 0xDFFFD) || range(ch, 0xE1000, 0xEFFFD);
+ }
+
// Read a unicode escape : does not allow \\ bypass
private final int readUnicodeEscape() {
int ch = reader.readChar();
if ( ch == EOF )
- error("Broken escape sequence");
+ fatal("Broken escape sequence");
switch (ch) {
case 'u': return readUnicode4Escape();
case 'U': return readUnicode8Escape();
default:
- error("Illegal unicode escape sequence value: \\%c (0x%02X)", ch, ch);
+ fatal("Illegal unicode escape sequence value: \\%c (0x%02X)", ch, ch);
}
return 0;
}
@@ -556,7 +571,7 @@ public final class TokenizerText implements Tokenizer
// If we made no progress, nothing found, not even a keyword -- it's an
// error.
if ( posn == reader.getPosition() )
- error("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch);
+ fatal("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch);
if ( Checking )
checkKeyword(token.getImage());
@@ -681,13 +696,13 @@ public final class TokenizerText implements Tokenizer
ch = reader.peekChar();
if ( ! isHexChar(ch) )
- error("Not a hex character: '%c'",ch);
+ fatal("Not a hex character: '%c'",ch);
stringBuilder.append((char)ch);
reader.readChar();
ch = reader.peekChar();
if ( ! isHexChar(ch) )
- error("Not a hex character: '%c'",ch);
+ fatal("Not a hex character: '%c'",ch);
stringBuilder.append((char)ch);
reader.readChar();
}
@@ -713,11 +728,11 @@ public final class TokenizerText implements Tokenizer
int ch = reader.readChar();
if ( ch == EOF ) {
// if ( endNL ) return stringBuilder.toString();
- error("Broken token: " + stringBuilder.toString(), y, x);
+ fatal("Broken token: " + stringBuilder.toString(), y, x);
}
if ( ch == NL )
- error("Broken token (newline): " + stringBuilder.toString(), y, x);
+ fatal("Broken token (newline): " + stringBuilder.toString(), y, x);
if ( ch == endCh ) {
return stringBuilder.toString();
@@ -736,7 +751,7 @@ public final class TokenizerText implements Tokenizer
if ( ch == EOF ) {
if ( endNL )
return stringBuilder.toString();
- error("Broken long string");
+ fatal("Broken long string");
}
if ( ch == quoteChar ) {
@@ -828,14 +843,14 @@ public final class TokenizerText implements Tokenizer
{
int ch = reader.peekChar();
if ( ch == EOF )
- error("Blank node label missing (EOF found)");
+ fatal("Blank node label missing (EOF found)");
if ( isWhitespace(ch) )
- error("Blank node label missing");
+ fatal("Blank node label missing");
// if ( ! isAlpha(ch) && ch != '_' )
// Not strict
if ( !RiotChars.isPNChars_U_N(ch) )
- error("Blank node label does not start with alphabetic or _ :" + (char)ch);
+ fatal("Blank node label does not start with alphabetic or _ :" + (char)ch);
reader.readChar();
stringBuilder.append((char)ch);
}
@@ -936,7 +951,7 @@ public final class TokenizerText implements Tokenizer
if ( x == 0 && !isDecimal )
// Possible a tokenizer error - should not have entered readNumber
// in the first place.
- error("Unrecognized as number");
+ fatal("Unrecognized as number");
if ( exponent(stringBuilder) ) {
isDouble = true;
@@ -975,7 +990,7 @@ public final class TokenizerText implements Tokenizer
x++;
}
if ( x == 0 )
- error("No hex characters after " + sb.toString());
+ fatal("No hex characters after " + sb.toString());
}
private int readDigits(StringBuilder buffer) {
@@ -1033,7 +1048,7 @@ public final class TokenizerText implements Tokenizer
readPossibleSign(sb);
int x = readDigits(sb);
if ( x == 0 )
- error("Malformed double: " + sb);
+ fatal("Malformed double: " + sb);
return true;
}
@@ -1041,7 +1056,7 @@ public final class TokenizerText implements Tokenizer
stringBuilder.setLength(0);
a2z(stringBuilder);
if ( stringBuilder.length() == 0 )
- error("Bad language tag");
+ fatal("Bad language tag");
for (;;) {
int ch = reader.peekChar();
if ( ch == '-' ) {
@@ -1050,7 +1065,7 @@ public final class TokenizerText implements Tokenizer
int x = stringBuilder.length();
a2zN(stringBuilder);
if ( stringBuilder.length() == x )
- error("Bad language tag");
+ fatal("Bad language tag");
} else
break;
}
@@ -1087,7 +1102,7 @@ public final class TokenizerText implements Tokenizer
// Convert to UTF-16. Note that the rest of any system this is used
// in must also respect codepoints and surrogate pairs.
if ( !Character.isDefined(ch) && !Character.isSupplementaryCodePoint(ch) )
- error("Illegal codepoint: 0x%04X", ch);
+ fatal("Illegal codepoint: 0x%04X", ch);
char[] chars = Character.toChars(ch);
buffer.append(chars);
}
@@ -1165,7 +1180,7 @@ public final class TokenizerText implements Tokenizer
private final int readLiteralEscape() {
int c = reader.readChar();
if ( c == EOF )
- error("Escape sequence not completed");
+ fatal("Escape sequence not completed");
switch (c) {
case 'n': return NL;
@@ -1179,7 +1194,7 @@ public final class TokenizerText implements Tokenizer
case 'u': return readUnicode4Escape();
case 'U': return readUnicode8Escape();
default:
- error("Illegal escape sequence value: %c (0x%02X)", c, c);
+ fatal("Illegal escape sequence value: %c (0x%02X)", c, c);
return 0;
}
}
@@ -1191,7 +1206,7 @@ public final class TokenizerText implements Tokenizer
int c = reader.readChar();
if ( c == EOF )
- error("Escape sequence not completed");
+ fatal("Escape sequence not completed");
switch (c) {
case '_': case '~': case '.': case '-': case '!': case '$': case '&':
@@ -1200,7 +1215,7 @@ public final class TokenizerText implements Tokenizer
case '=': case '/': case '?': case '#': case '@': case '%':
return c;
default:
- error("illegal character escape value: \\%c", c);
+ fatal("illegal character escape value: \\%c", c);
return 0;
}
}
@@ -1211,7 +1226,7 @@ public final class TokenizerText implements Tokenizer
private final int readUnicode8Escape() {
int ch8 = readHexSequence(8);
if ( ch8 > Character.MAX_CODE_POINT )
- error("Illegal code point in \\U sequence value: 0x%08X", ch8);
+ fatal("Illegal code point in \\U sequence value: 0x%08X", ch8);
return ch8;
}
@@ -1229,12 +1244,12 @@ public final class TokenizerText implements Tokenizer
private final int readHexChar() {
int ch = reader.readChar();
if ( ch == EOF )
- error("Not a hexadecimal character (end of file)");
+ fatal("Not a hexadecimal character (end of file)");
int x = valHexChar(ch);
if ( x != -1 )
return x;
- error("Not a hexadecimal character: " + (char)ch);
+ fatal("Not a hexadecimal character: " + (char)ch);
return -1;
}
@@ -1242,12 +1257,12 @@ public final class TokenizerText implements Tokenizer
for (int i = 0; i < str.length(); i++) {
char want = str.charAt(i);
if ( reader.eof() ) {
- error("End of input during expected string: " + str);
+ fatal("End of input during expected string: " + str);
return false;
}
int inChar = reader.peekChar();
if ( inChar != want ) {
- error("expected \"" + str + "\"");
+ fatal("expected \"" + str + "\"");
return false;
}
reader.readChar();
@@ -1255,17 +1270,28 @@ public final class TokenizerText implements Tokenizer
return true;
}
+ /** Warning - can continue. */
private void warning(String message, Object... args) {
String msg = String.format(message, args);
errorHandler.warning(msg, reader.getLineNum(), reader.getColNum());
- //exception(message, args);
}
+ /** Error - at the tokenizer level, it can continue (with some junk) but it is a serious error and the
+ * caller probably should treat as an error and stop.
+ * @param message
+ * @param args
+ */
private void error(String message, Object... args) {
String msg = String.format(message, args);
+ errorHandler.error(msg, reader.getLineNum(), reader.getColNum());
+ }
+
+ /** Structural error - unrecoverable - but reported as ERROR (FATAL can imply system fault) */
+ private void fatal(String message, Object... args) {
+ String msg = String.format(message, args);
long line = reader.getLineNum();
long col = reader.getColNum();
- errorHandler.error(msg, line, col);
+ errorHandler.fatal(msg, line, col);
// We require that errors cause the tokenizer to stop so in case the
// provided error handler does not, we throw an exception.
throw new RiotParseException(message, line, col);
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/AbstractTestLangNTuples.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/AbstractTestLangNTuples.java
index d9fd15a..3e6584e 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/AbstractTestLangNTuples.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/AbstractTestLangNTuples.java
@@ -131,7 +131,7 @@ abstract public class AbstractTestLangNTuples
}
// Bad terms - but accepted by default.
- @Test(expected = ExFatal.class)
+ @Test(expected = ExError.class)
public void tuple_bad_10() {
parseCount("<x> <p> <bad uri> .");
}
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
index abd4a4d..ccc2ffa 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTrig.java
@@ -22,7 +22,7 @@ import static org.junit.Assert.assertEquals;
import org.apache.jena.graph.Triple ;
import org.apache.jena.riot.ErrorHandlerTestLib ;
-import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal ;
+import org.apache.jena.riot.ErrorHandlerTestLib.ExError;
import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning ;
import org.apache.jena.riot.Lang ;
import org.apache.jena.sparql.core.DatasetGraph ;
@@ -67,13 +67,13 @@ public class TestLangTrig
// Also need to check that the RiotExpection is called in normal use.
// Bad terms.
- @Test (expected=ExFatal.class)
+ @Test (expected=ExError.class)
public void trig_20() { parse("@prefix ex: <bad iri> .", "{ ex:s ex:p 123 }") ; }
- @Test (expected=ExFatal.class)
+ @Test (expected=ExError.class)
public void trig_21() { parse("@prefix ex: <http://example/> .", "{ ex:s <http://example/broken p> 123 }") ; }
- @Test (expected=ExFatal.class)
+ @Test (expected=ExError.class)
public void trig_22() { parse("{ <x> <p> 'number'^^<bad uri> }") ; }
@Test (expected=ExWarning.class)
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
index b4bb87b..f7c66f0 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangTurtle.java
@@ -33,6 +33,7 @@ import org.apache.jena.rdf.model.Model ;
import org.apache.jena.rdf.model.ModelFactory ;
import org.apache.jena.rdf.model.Property ;
import org.apache.jena.rdf.model.Resource ;
+import org.apache.jena.riot.ErrorHandlerTestLib.ExError;
import org.apache.jena.riot.ErrorHandlerTestLib.ExFatal ;
import org.apache.jena.riot.ErrorHandlerTestLib.ExWarning ;
import org.apache.jena.riot.Lang ;
@@ -157,7 +158,7 @@ public class TestLangTurtle
@Test(expected=ExFatal.class)
public void errorBadDatatype() { parse("<p> <p> 'q'^^.") ; }
- @Test(expected=ExFatal.class)
+ @Test(expected=ExError.class)
public void errorBadURI_1()
{ parse("<http://example/a b> <http://example/p> 123 .") ; }
@@ -171,10 +172,10 @@ public class TestLangTurtle
{ parse("<http://example/a%Aab> <http://example/p> 123 .") ; }
// Bad URIs
- @Test (expected=ExFatal.class)
+ @Test (expected=ExError.class)
public void errorBadURI_4() { parse("@prefix ex: <bad iri> . ex:s ex:p 123 ") ; }
- @Test (expected=ExFatal.class)
+ @Test (expected=ExError.class)
public void errorBadURI_5() { parse("<x> <p> 'number'^^<bad uri> ") ; }
@Test (expected=ExFatal.class)
diff --git a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
index 344cb97..9621ddd 100644
--- a/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
+++ b/jena-arq/src/test/java/org/apache/jena/riot/tokens/TestTokenizer.java
@@ -33,14 +33,14 @@ import org.apache.jena.sparql.ARQConstants ;
import org.junit.Test ;
public class TestTokenizer {
- // WORKERS
+
private static Tokenizer tokenizer(String string) {
return tokenizer(string, false) ;
}
private static Tokenizer tokenizer(String string, boolean lineMode) {
PeekReader r = PeekReader.readString(string) ;
- Tokenizer tokenizer = new TokenizerText(r, lineMode) ;
+ Tokenizer tokenizer = TokenizerText.create().source(r).lineMode(lineMode).build();
return tokenizer ;
}
@@ -1109,7 +1109,7 @@ public class TestTokenizer {
@Test
public void token_rdf_star_1() {
- Tokenizer tokenizer = tokenizer("<<>>", true) ;
+ Tokenizer tokenizer = tokenizer("<<>>") ;
testNextToken(tokenizer, TokenType.LT2) ;
testNextToken(tokenizer, TokenType.GT2) ;
assertFalse(tokenizer.hasNext()) ;
@@ -1117,7 +1117,7 @@ public class TestTokenizer {
@Test
public void token_rdf_star_2() {
- Tokenizer tokenizer = tokenizer("<< >>", true) ;
+ Tokenizer tokenizer = tokenizer("<< >>") ;
testNextToken(tokenizer, TokenType.LT2) ;
testNextToken(tokenizer, TokenType.GT2) ;
assertFalse(tokenizer.hasNext()) ;
@@ -1125,7 +1125,7 @@ public class TestTokenizer {
@Test
public void token_rdf_star_3() {
- Tokenizer tokenizer = tokenizer("<<:s x:p 123>> :q ", true) ;
+ Tokenizer tokenizer = tokenizer("<<:s x:p 123>> :q ") ;
testNextToken(tokenizer, TokenType.LT2) ;
testNextToken(tokenizer, TokenType.PREFIXED_NAME, "", "s") ;
testNextToken(tokenizer, TokenType.PREFIXED_NAME, "x", "p") ;
@@ -1137,7 +1137,7 @@ public class TestTokenizer {
@Test
public void token_rdf_star_4() {
- Tokenizer tokenizer = tokenizer("<<<>>>", true) ;
+ Tokenizer tokenizer = tokenizer("<<<>>>") ;
testNextToken(tokenizer, TokenType.LT2) ;
Token t = testNextToken(tokenizer, TokenType.IRI) ;
assertEquals("", t.getImage());
diff --git a/jena-arq/testing/RIOT/Lang/Changes b/jena-arq/testing/RIOT/Lang/Changes
new file mode 100644
index 0000000..74211c1
--- /dev/null
+++ b/jena-arq/testing/RIOT/Lang/Changes
@@ -0,0 +1,15 @@
+Tests localName_with_assigned_nfc_PN_CHARS_BASE_character_boundaries
+in Turtle and Trig contain IRIs with the character \U000E01EF
+in the result nt/nq files.
+
+That character is illegal in IRIs, even if allowed by syntax.
+So it causes a failure when reading the test.
+
+It is not in RFC 3987 - the block E0000-E0FFF is excluded.
+
+ ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
+ / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
+ / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
+ / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
+ / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
+ / %xD0000-DFFFD / %xE1000-EFFFD
diff --git a/jena-arq/testing/RIOT/Lang/TrigStd/manifest.ttl b/jena-arq/testing/RIOT/Lang/TrigStd/manifest.ttl
index 2b21df6..7289612 100644
--- a/jena-arq/testing/RIOT/Lang/TrigStd/manifest.ttl
+++ b/jena-arq/testing/RIOT/Lang/TrigStd/manifest.ttl
@@ -53,7 +53,10 @@
<#underscore_in_localName>
<#localname_with_COLON>
<#localName_with_assigned_nfc_bmp_PN_CHARS_BASE_character_boundaries>
- <#localName_with_assigned_nfc_PN_CHARS_BASE_character_boundaries>
+
+ ## Contains \U000E01EF in the result which is not legal in an IRI.
+ ## <#localName_with_assigned_nfc_PN_CHARS_BASE_character_boundaries>
+
<#localName_with_nfc_PN_CHARS_BASE_character_boundaries>
<#localName_with_leading_underscore>
<#localName_with_leading_digit>
diff --git a/jena-arq/testing/RIOT/Lang/TurtleStd/manifest.ttl b/jena-arq/testing/RIOT/Lang/TurtleStd/manifest.ttl
index cc07d8f..807d9e5 100644
--- a/jena-arq/testing/RIOT/Lang/TurtleStd/manifest.ttl
+++ b/jena-arq/testing/RIOT/Lang/TurtleStd/manifest.ttl
@@ -41,7 +41,10 @@
<#underscore_in_localName>
<#localname_with_COLON>
<#localName_with_assigned_nfc_bmp_PN_CHARS_BASE_character_boundaries>
- <#localName_with_assigned_nfc_PN_CHARS_BASE_character_boundaries>
+
+ ## Contains \U000E01EF in the result which is not legal in an IRI.
+ ## <#localName_with_assigned_nfc_PN_CHARS_BASE_character_boundaries>
+
<#localName_with_nfc_PN_CHARS_BASE_character_boundaries>
<#localName_with_leading_underscore>
<#localName_with_leading_digit>
diff --git a/jena-core/src/main/java/org/apache/jena/util/FileManagerImpl.java b/jena-core/src/main/java/org/apache/jena/util/FileManagerImpl.java
index e4fab09..11bc695 100644
--- a/jena-core/src/main/java/org/apache/jena/util/FileManagerImpl.java
+++ b/jena-core/src/main/java/org/apache/jena/util/FileManagerImpl.java
@@ -114,7 +114,10 @@ public class FileManagerImpl implements FileManager
}
/** Create with the given location mapper */
- protected FileManagerImpl(LocationMapper _mapper) { setLocationMapper(_mapper) ; }
+ protected FileManagerImpl(LocationMapper _mapper) {
+ this();
+ setLocationMapper(_mapper);
+ }
@Override
public FileManager clone() { return clone(this) ; }
diff --git a/jena-db/jena-dboe-base/src/main/java/org/apache/jena/dboe/sys/Sys.java b/jena-db/jena-dboe-base/src/main/java/org/apache/jena/dboe/sys/Sys.java
index cfd7c47..dca2415 100644
--- a/jena-db/jena-dboe-base/src/main/java/org/apache/jena/dboe/sys/Sys.java
+++ b/jena-db/jena-dboe-base/src/main/java/org/apache/jena/dboe/sys/Sys.java
@@ -30,7 +30,7 @@ import org.slf4j.LoggerFactory;
/** Low level environment */
public class Sys
{
- static final Logger log = LoggerFactory.getLogger("Sys");
+ static final Logger log = LoggerFactory.getLogger("org.apache.jena.dboe.Sys");
/** System log - use for general messages (a few) and warnings.
* Generally, do not log events unless you want every user to see them every time.
@@ -39,9 +39,9 @@ public class Sys
*/
/** General system log */
- public static final Logger syslog = LoggerFactory.getLogger("System");
+ public static final Logger syslog = LoggerFactory.getLogger("org.apache.jena.dboe.System");
/** Send warnings and error */
- public static final Logger errlog = LoggerFactory.getLogger("System");
+ public static final Logger errlog = LoggerFactory.getLogger("org.apache.jena.dboe.System");
/** Size, in bytes, of a Java long */
public static final int SizeOfLong = Long.BYTES; // Long.SIZE/Byte.SIZE ;
diff --git a/jena-db/jena-tdb2/src/main/java/org/apache/jena/tdb2/sys/SystemTDB.java b/jena-db/jena-tdb2/src/main/java/org/apache/jena/tdb2/sys/SystemTDB.java
index fe4209e..0107729 100644
--- a/jena-db/jena-tdb2/src/main/java/org/apache/jena/tdb2/sys/SystemTDB.java
+++ b/jena-db/jena-tdb2/src/main/java/org/apache/jena/tdb2/sys/SystemTDB.java
@@ -135,7 +135,7 @@ public class SystemTDB
propertyFileName = x;
}
- public static final boolean is64bitSystem = determineIf64Bit();
+ public static final boolean is64bitSystem = Sys.is64bitSystem;
private static Properties properties = readPropertiesFile();
@@ -314,40 +314,6 @@ public class SystemTDB
return p;
}
- // --------
-
- public static final boolean isWindows = determineIfWindows(); // Memory mapped files behave differently.
-
- //Or look in File.listRoots.
- //Alternative method:
- // http://stackoverflow.com/questions/1293533/name-of-the-operating-system-in-java-not-os-name
-
- private static boolean determineIfWindows() {
- String s = System.getProperty("os.name");
- if ( s == null )
- return false;
- return s.startsWith("Windows ");
- }
-
- private static boolean determineIf64Bit() {
- String s = System.getProperty("sun.arch.data.model");
- if ( s != null ) {
- boolean b = s.equals("64");
- TDB2.logInfo.debug("System architecture: " + (b ? "64 bit" : "32 bit"));
- return b;
- }
- // Not a SUN VM
- s = System.getProperty("java.vm.info");
- if ( s == null ) {
- log.warn("Can't determine the data model");
- return false;
- }
- log.debug("Can't determine the data model from 'sun.arch.data.model' - using java.vm.info");
- boolean b = s.contains("64");
- TDB2.logInfo.debug("System architecture: (from java.vm.info) " + (b ? "64 bit" : "32 bit"));
- return b;
- }
-
// ---- File mode
private static FileMode fileMode = null;
diff --git a/jena-db/jena-tdb2/src/test/java/org/apache/jena/tdb2/ConfigTest.java b/jena-db/jena-tdb2/src/test/java/org/apache/jena/tdb2/ConfigTest.java
index e1d2f59..937146c 100644
--- a/jena-db/jena-tdb2/src/test/java/org/apache/jena/tdb2/ConfigTest.java
+++ b/jena-db/jena-tdb2/src/test/java/org/apache/jena/tdb2/ConfigTest.java
@@ -19,7 +19,7 @@
package org.apache.jena.tdb2;
import org.apache.jena.atlas.lib.FileOps;
-import org.apache.jena.tdb2.sys.SystemTDB;
+import org.apache.jena.base.Sys;
public class ConfigTest
{
@@ -27,7 +27,7 @@ public class ConfigTest
// Place under target
private static final String testingDir = "target/tdb-testing";
private static final String testingDirDB = "target/tdb-testing/DB";
- static boolean nonDeleteableMMapFiles = SystemTDB.isWindows;
+ static boolean nonDeleteableMMapFiles = Sys.isWindows;
static boolean initialized = false;
diff --git a/jena-fuseki2/jena-fuseki-webapp/src/test/java/org/apache/jena/fuseki/TestAdminAPI.java b/jena-fuseki2/jena-fuseki-webapp/src/test/java/org/apache/jena/fuseki/TestAdminAPI.java
index d895a65..b453a51 100644
--- a/jena-fuseki2/jena-fuseki-webapp/src/test/java/org/apache/jena/fuseki/TestAdminAPI.java
+++ b/jena-fuseki2/jena-fuseki-webapp/src/test/java/org/apache/jena/fuseki/TestAdminAPI.java
@@ -34,6 +34,7 @@ import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.jena.atlas.web.HttpException;
import org.apache.jena.atlas.web.TypedInputStream;
+import org.apache.jena.base.Sys;
import org.apache.jena.fuseki.webapp.FusekiWebapp;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.rdfconnection.RDFConnection;
@@ -61,7 +62,7 @@ public class TestAdminAPI extends AbstractFusekiTest {
@Test public void add_delete_api_3() throws Exception {
// Deleted mmap files on Windows does not go away until the JVM exits.
- if ( org.apache.jena.tdb2.sys.SystemTDB.isWindows )
+ if ( Sys.isWindows )
return;
testAddDelete("db_tdb2", "tdb2", true);
}
diff --git a/jena-iri/src/main/java/org/apache/jena/iri/impl/Parser.java b/jena-iri/src/main/java/org/apache/jena/iri/impl/Parser.java
index 61755ad..18c55fc 100644
--- a/jena-iri/src/main/java/org/apache/jena/iri/impl/Parser.java
+++ b/jena-iri/src/main/java/org/apache/jena/iri/impl/Parser.java
@@ -18,14 +18,10 @@
package org.apache.jena.iri.impl;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.io.Reader;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
+import java.io.*;
import java.net.IDN;
import org.apache.jena.iri.* ;
@@ -227,15 +223,27 @@ public class Parser implements IRIComponents, ViolationCodes {
}
}
+ static public void devParse(String uriStr) throws IOException {
+ LineNumberReader in = new LineNumberReader(new StringReader(uriStr));
+ devParse(in);
+ }
+
static public void main(String args[]) throws IOException {
- LineNumberReader in = new LineNumberReader(new InputStreamReader(
- System.in));
+ LineNumberReader in = new LineNumberReader(new InputStreamReader(System.in));
+ devParse(in);
+ }
+
+ static private void devParse(LineNumberReader in) throws IOException {
+
IRIImpl last = null;
DEBUG = true;
IRIFactory factory = IRIFactory.iriImplementation();
while (true) {
- String s = in.readLine().trim();
+ String s = in.readLine();
+ if ( s == null )
+ return;
+ s = s.trim();
if (s.equals("quit"))
return;
IRIImpl iri = (IRIImpl) factory.create(s);