You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pr@jena.apache.org by GitBox <gi...@apache.org> on 2021/03/01 13:23:21 UTC

[GitHub] [jena] afs opened a new pull request #940: JENA-2031: Refine IRI settings

afs opened a new pull request #940:
URL: https://github.com/apache/jena/pull/940


   This work was inspired by parsing Wikidata with Jena3: finding all the types of warnings and errors that arise, then making the new code agree with Jena3, except where newer RFCs have changed the situation (e.g. percent encoding in DNS host names is now legal). The code also reflects Jena's behaviour of warning about bad IRIs but not signalling a parse error - very large datasets have some less-than-perfect IRIs in them, and aborting a load because of this is annoying.
   
   Tests added to pin down expectations.
   
   There is also a lot of clearing up and refactoring.
   
   Performance checked: the Jena3 and Jena4 times to parse (BSBM data) are the same.
   
   Hopefully, the last large IRI-related PR!
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: pr-unsubscribe@jena.apache.org
For additional commands, e-mail: pr-help@jena.apache.org


[GitHub] [jena] afs merged pull request #940: JENA-2031: Refine IRI settings

Posted by GitBox <gi...@apache.org>.
afs merged pull request #940:
URL: https://github.com/apache/jena/pull/940


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: pr-unsubscribe@jena.apache.org
For additional commands, e-mail: pr-help@jena.apache.org


[GitHub] [jena] kinow commented on a change in pull request #940: JENA-2031: Refine IRI settings

Posted by GitBox <gi...@apache.org>.
kinow commented on a change in pull request #940:
URL: https://github.com/apache/jena/pull/940#discussion_r584954705



##########
File path: jena-arq/src/main/java/org/apache/jena/riot/RDFParserBuilder.java
##########
@@ -291,6 +293,14 @@ public RDFParserBuilder httpClient(HttpClient httpClient) {
      */
     public RDFParserBuilder resolveURIs(boolean flag) { this.resolveURIs = flag ; return this; }
 
+    /**
+     * Provide a specific {@link IRIxResolver} to check and resolve URIs. It's

Review comment:
       s/It's/Its

##########
File path: jena-arq/src/main/java/org/apache/jena/riot/system/Checker.java
##########
@@ -18,161 +18,363 @@
 
 package org.apache.jena.riot.system;
 
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.regex.Pattern;
 
-import org.apache.jena.graph.Node ;
-import org.apache.jena.graph.Triple ;
-import org.apache.jena.riot.checker.* ;
+import org.apache.jena.JenaRuntime;
+import org.apache.jena.datatypes.RDFDatatype;
+import org.apache.jena.graph.Node;
+import org.apache.jena.graph.Node_Triple;
+import org.apache.jena.graph.Triple;
+import org.apache.jena.iri.IRI;
+import org.apache.jena.iri.IRIComponents;
+import org.apache.jena.iri.Violation;
+import org.apache.jena.irix.IRIs;
+import org.apache.jena.irix.SetupJenaIRI;
 import org.apache.jena.sparql.core.Quad;
+import org.apache.jena.sparql.graph.NodeConst;
+import org.apache.jena.util.SplitIRI;
 
-/** A checker that drives the process of validating RDF terms, triples and quads. */
-public final class Checker
-{
-    private boolean allowRelativeIRIs = false ;
-    private boolean warningsAreErrors = false ;
-    private ErrorHandler handler ;
+/**
+ * Functions for checking nodes, triples and quads.
+ * <p>
+ * If the errorHandler is null, use the system wide handler.
+ * <p>
+ * If the errorHandler line/columns numbers are -1, -1, messages do not include them.
+ * <p>
+ * Operations "<tt>checkXXX(<i>item</i>)</tt>" are for boolean testing
+ * and do not generate output.
+ */
 
-    private NodeChecker checkLiterals ;
-    private NodeChecker checkURIs ;
-    private NodeChecker checkBlankNodes ;
-    private NodeChecker checkVars ;
+public class Checker {
 
-    public Checker() {
-        this(null);
+    /** A node -- must be concrete node or a variable. */
+    public static boolean check(Node node) {
+        return check(node, nullErrorHandler, -1, -1);
     }
 
-    public Checker(ErrorHandler handler) {
-        if ( handler == null )
-            handler = ErrorHandlerFactory.getDefaultErrorHandler();
-        this.handler = handler;
+    /** A node -- must be a concrete node or a variable. */
+    public static boolean check(Node node, ErrorHandler errorHandler, long line, long col) {
+        if ( node.isURI() )
+            return checkIRI(node, errorHandler, line, col);
+        else if ( node.isBlank() )
+            return checkBlankNode(node, errorHandler, line, col);
+        else if ( node.isLiteral() )
+            return checkLiteral(node, errorHandler, line, col);
+        else if ( node.isVariable() )
+            return checkVar(node, errorHandler, line, col);
+        else if ( node.isNodeTriple() ) {
+            Triple t = Node_Triple.triple(node);
+            return check(t.getSubject()) && check(t.getPredicate()) && check(t.getObject())
+                    && checkTriple(t);
+        }
+        errorHandler(errorHandler).warning("Not a recognized node: ", line, col);
+        return false;
+    }
 
-        checkLiterals = new CheckerLiterals(handler);
+    // ==== IRIs
 
-        checkURIs = new CheckerIRI(handler);
-        checkBlankNodes = new CheckerBlankNodes(handler);
-        checkVars = new CheckerVar(handler);
+    public static boolean checkIRI(Node node) {
+        return checkIRI(node, nullErrorHandler, -1, -1);
     }
 
-    public boolean check(Node node, long line, long col) {
-        // NodeVisitor?
-        if      ( node.isURI() )        return checkIRI(node, line, col) ;
-        else if ( node.isBlank() )      return checkBlank(node, line, col) ;
-        else if ( node.isLiteral() )    return checkLiteral(node, line, col) ;
-        else if ( node.isVariable() )   return checkVar(node, line, col) ;
-        handler.warning("Not a recognized node: ", line, col) ;
-        return false ;
+    public static boolean checkIRI(Node node, ErrorHandler errorHandler, long line, long col) {
+        if ( !node.isURI() ) {
+            errorHandler(errorHandler).error("Not a URI: " + node, line, col);
+            return false;
+        }
+        return checkIRI(node.getURI(), errorHandler, -1, -1);
     }
 
-    /** Check a triple - assumes individual nodes are legal */
-    public boolean check(Triple triple, long line, long col) {
-        return checkTriple(triple.getSubject(), triple.getPredicate(), triple.getObject(), line, col);
+    public static boolean checkIRI(String iriStr) {
+        return checkIRI(iriStr, nullErrorHandler, -1, -1);
     }
 
-    /** Check a triple against the RDF rules for a triple : subject is a IRI or bnode, predicate is a IRI and object is an bnode, literal or IRI */
-    public boolean checkTriple(Node subject, Node predicate, Node object, long line, long col) {
-        boolean rc = true;
+    /** See also {@link IRIs#reference} */
+    public static boolean checkIRI(String iriStr, ErrorHandler errorHandler, long line, long col) {
+        IRI iri = SetupJenaIRI.iriCheckerFactory().create(iriStr);
+        boolean b = iriViolations(iri, errorHandler, line, col);
+        return b;
+    }
 
-        if ( subject == null || (!subject.isURI() && !subject.isBlank()) ) {
-            handler.error("Subject is not a URI or blank node", line, col);
-            rc = false;
-        }
-        if ( predicate == null || (!predicate.isURI()) ) {
-            handler.error("Predicate not a URI", line, col);
-            rc = false;
+    /**
+     * Process violations on an IRI Calls the {@link ErrorHandler} on all errors and
+     * warnings (as warnings).
+     */
+    public static void iriViolations(IRI iri) {
+        iriViolations(iri, null, false, true, -1L, -1L);
+    }
+
+    /**
+     * Process violations on an IRI Calls the {@link ErrorHandler} on all errors and
+     * warnings (as warnings).
+     */
+    public static boolean iriViolations(IRI iri, ErrorHandler errorHandler, long line, long col) {
+        return iriViolations(iri, errorHandler, false, true, line, col);
+    }
+
+    /**
+     * Process violations on an IRI Calls the errorHandler on all errors and warnings
+     * (as warning). (If checking for relative IRIs, these are sent out as errors.)
+     * Assumes error handler throws exceptions on errors if need be
+     */
+    public static boolean iriViolations(IRI iri, ErrorHandler errorHandler,
+                                        boolean allowRelativeIRIs, boolean includeIRIwarnings,
+                                        long line, long col) {
+        if ( !allowRelativeIRIs && iri.isRelative() )
+            errorHandler(errorHandler).error("Relative IRI: " + iri, line, col);
+
+        boolean isOK = true;
+
+        if ( iri.hasViolation(includeIRIwarnings) ) {
+            Iterator<Violation> iter = iri.violations(includeIRIwarnings);
+
+            for ( ; iter.hasNext() ; ) {
+                Violation v = iter.next();
+                int code = v.getViolationCode();
+                boolean isError = v.isError();
+
+                // Anything we want to reprioritise?
+                if ( code == Violation.LOWERCASE_PREFERRED && v.getComponent() != IRIComponents.SCHEME ) {
+                    // Issue warning about the scheme part. Not e.g. DNS names.
+                    continue;
+                }
+                String msg = v.getShortMessage();
+                String iriStr = iri.toString();
+
+                errorHandler(errorHandler).warning("Bad IRI: " + msg, line, col);
+
+//                if ( isError )
+//                    errorHandler(errorHandler).warning("Bad IRI: " + msg, line, col);
+//                else
+//                    errorHandler(errorHandler).warning("Not advised IRI: " + msg, line, col);
+                isOK = true;
+            }
         }
-        if ( object == null || (!object.isURI() && !object.isBlank() && !object.isLiteral()) ) {
-            handler.error("Object is not a URI, blank node or literal", line, col);
-            rc = false;
+        return isOK;
+    }
+
+    // ==== Literals
+
+    final static private Pattern langPattern = Pattern.compile("[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*");
+
+    public static boolean checkLiteral(Node node) {
+        return checkLiteral(node, nullErrorHandler, -1, -1);
+    }
+
+    public static boolean checkLiteral(Node node, ErrorHandler errorHandler, long line, long col) {
+        if ( !node.isLiteral() ) {
+            errorHandler(errorHandler).error("Not a literal: " + node, line, col);
+            return false;
         }
-        return rc;
+
+        return checkLiteral(node.getLiteralLexicalForm(), node.getLiteralLanguage(), node.getLiteralDatatype(), errorHandler, line, col);
     }
 
-    /** Check a quad - assumes individual nodes are legal */
-    public boolean checkQuad(Quad quad, long line, long col) {
-        return checkQuad(quad.getGraph(), quad.getSubject(), quad.getPredicate(), quad.getObject(), line, col);
+    public static boolean checkLiteral(String lexicalForm, RDFDatatype datatype, ErrorHandler errorHandler, long line, long col) {
+        return checkLiteral(lexicalForm, null, datatype, errorHandler, line, col);
     }
 
-    /** Check a quad against the RDF rules for a quad : subject is a IRI or bnode, predicate is a IRI and object is an bnode, literal or IRI */
-    public boolean checkQuad(Node graph, Node subject, Node predicate, Node object, long line, long col) {
-        boolean rc = true;
+    public static boolean checkLiteral(String lexicalForm, String lang, ErrorHandler errorHandler, long line, long col) {
+        return checkLiteral(lexicalForm, lang, null, errorHandler, line, col);
+    }
 
-        if ( graph == null || (!graph.isURI() && !graph.isBlank()) ) {
-            handler.error("Graph name is not a URI or blank node", line, col);
-            rc = false;
+    public static boolean checkLiteral(String lexicalForm, String lang, RDFDatatype datatype, ErrorHandler errorHandler, long line,
+                                       long col) {
+        boolean hasLang = lang != null && !lang.equals("");
+        if ( !hasLang ) {
+            // Datatype check (and RDF 1.0 simple literals are always well formed)
+            if ( datatype != null )
+                return validateByDatatype(lexicalForm, datatype, errorHandler, line, col);
+            return true;
         }
 
-        if ( subject == null || (!subject.isURI() && !subject.isBlank() && !subject.isNodeTriple() ) ) {
-            handler.error("Subject is not a URI, blank node or RDF-star triple term", line, col);
-            rc = false;
+        // Has a language.
+        if ( JenaRuntime.isRDF11 ) {
+            if ( datatype != null && !Objects.equals(datatype.getURI(), NodeConst.rdfLangString.getURI()) ) {
+                errorHandler(errorHandler).error("Literal has language but wrong datatype", line, col);
+                return false;
+            }
+        } else {
+            if ( datatype != null ) {
+                errorHandler(errorHandler).error("Literal has datatype and language", line, col);
+                return false;
+            }
         }
-        if ( predicate == null || (!predicate.isURI()) ) {
-            handler.error("Predicate not a URI", line, col);
-            rc = false;
+
+        // Test language tag format -- not a perfect test.
+        if ( !lang.isEmpty() && !langPattern.matcher(lang).matches() ) {
+            errorHandler(errorHandler).warning("Language not valid: " + lang, line, col);
+            return false;
         }
-        if ( object == null || (!object.isURI() && !object.isBlank() && !object.isLiteral() && !subject.isNodeTriple() ) ) {
-            handler.error("Object is not a URI, blank node, literal or RDF-star triple term", line, col);
-            rc = false;
+        return true;
+    }
+
+    // Whitespace.
+    // XSD allows whitespace before and after the lexical forms of a literal but not
+    // insiode.

Review comment:
       s/insiode/inside

##########
File path: jena-core/src/main/java/org/apache/jena/irix/SystemIRIx.java
##########
@@ -68,15 +89,24 @@ private static IRIx establishBaseURI() {
             String baseStr = IRILib.filenameToIRI("./");
             if ( ! baseStr.endsWith("/") )
                 baseStr = baseStr+"/";
-            return setSystemBase(baseStr);
+            return setupBase(baseStr);
         } catch (Throwable ex) {
             ex.printStackTrace();
             // e.g. No filesystem.
-            return IRIx.create("urn:base:");
+            return IRIx.create(fallbackBaseURI);
         }
     }
 
-    private static IRIx setSystemBase(String baseStr) {
+    /**
+     * Create an {@link IRIx} suitable for a system base.
+     * This oepration always returns an {@link IRIx}

Review comment:
       s/oepration/operation




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: pr-unsubscribe@jena.apache.org
For additional commands, e-mail: pr-help@jena.apache.org