You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/08/03 21:13:15 UTC
any23 git commit: ANY23-382 don't kill extraction on fatal json parsing errors
Repository: any23
Updated Branches:
refs/heads/master 817e744af -> 837f92b91
ANY23-382 don't kill extraction on fatal json parsing errors
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/837f92b9
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/837f92b9
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/837f92b9
Branch: refs/heads/master
Commit: 837f92b9167d7944dbc88a965d6e17cf22b375e0
Parents: 817e744
Author: Hans <fi...@gmail.com>
Authored: Fri Aug 3 16:06:15 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Fri Aug 3 16:06:15 2018 -0500
----------------------------------------------------------------------
.../any23/extractor/rdf/BaseRDFExtractor.java | 197 +++++--------------
.../any23/extractor/rdf/JSONLDExtractor.java | 27 +++
.../any23/extractor/rdf/RDFParserFactory.java | 2 +-
.../html/EmbeddedJSONLDExtractorTest.java | 10 +-
.../resources/html/html-jsonld-fatal-error.html | 61 ++++++
5 files changed, 151 insertions(+), 146 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index c0994bd..0e32efc 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -17,11 +17,14 @@
package org.apache.any23.extractor.rdf;
+import com.fasterxml.jackson.core.JsonLocation;
+import com.fasterxml.jackson.core.JsonParseException;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
+import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.html.JsoupUtils;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParseException;
@@ -197,7 +200,18 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
} catch (RDFHandlerException ex) {
throw new IllegalStateException("Unexpected exception.", ex);
} catch (RDFParseException ex) {
- throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult);
+ Throwable cause = ex.getCause();
+ if (cause instanceof JsonParseException) {
+ JsonParseException err = (JsonParseException)cause;
+ JsonLocation loc = err.getLocation();
+ if (loc == null) {
+ extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), -1L, -1L);
+ } else {
+ extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), loc.getLineNr(), loc.getColumnNr());
+ }
+ } else {
+ throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult);
+ }
}
}
@@ -205,7 +219,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
private static class JsonCleaningInputStream extends InputStream {
private boolean inEscape;
- private boolean inQuote;
+ private int quoteChar;
private boolean inCDATA;
private boolean needsComma;
@@ -240,13 +254,37 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
for (;;) {
int c = stream.read();
- if (inQuote) {
- return readQuoted(c, stream);
+ //other types of comments are handled by enabling fasterxml's
+ //ALLOW_COMMENTS and ALLOW_YAML_COMMENTS features
+ if (inCDATA) {
+ if (c == ']' && isNextOrUnread(stream, ']', '>')) {
+ inCDATA = false;
+ continue;
+ }
+ } else {
+ if (c == '<' && isNextOrUnread(stream, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
+ inCDATA = true;
+ continue;
+ }
}
- //we're not in a quote
- c = stripComments(c, stream);
+ int q = quoteChar;
+ if (q != 0) {
+ //we're in a quote
+ if (inEscape) {
+ //end escape
+ inEscape = false;
+ } else if (c == '\\') {
+ //begin escape
+ inEscape = true;
+ } else if (c == q) {
+ //end quote
+ quoteChar = 0;
+ }
+ return c;
+ }
+ //we're not in a quote
switch (c) {
case ',':
case ';':
@@ -258,150 +296,21 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
//discard comma at end of object or array
needsComma = false;
return c;
- case -1:
- return c;
- default:
- if (Character.isWhitespace(c)) {
- return ' ';
- } else if (needsComma) {
- stream.unread(c);
- stream.unread(' ');
- needsComma = false;
- return ',';
- } else if (c == '"') {
- inQuote = true;
- }
- return c;
- }
- }
-
- }
-
- private int readQuoted(int c, PushbackInputStream stream) throws IOException {
- if (inEscape) {
- switch (c) {
- case 'u':
- //TODO: validate that 'u' is followed by 4 hex chars?
- case '"':
- case '\\':
- case '/':
- case 'b':
- case 'f':
- case 'n':
- case 'r':
- case 't':
- case -1:
- inEscape = false;
- return c;
default:
- stream.unread(c);
- inEscape = false;
- return '\\';
- }
- } else {
- switch (c) {
- case '\\':
- break;
- case '\n':
- stream.unread('n');
- break;
- case '\r':
- stream.unread('r');
- break;
- case '\b':
- stream.unread('b');
- break;
- case '\f':
- stream.unread('f');
- break;
- case '\t':
- stream.unread('t');
- break;
- case '"':
- inQuote = false;
- return c;
- case -1:
- return c;
- default:
- if (c < 0x20 || c == 0x7f) {
- String hex = Integer.toHexString(c);
- int ind = hex.length() - 1;
- stream.unread(hex.charAt(ind));
- stream.unread(ind == 0 ? '0' : hex.charAt(--ind));
- stream.unread(ind == 0 ? '0' : hex.charAt(--ind));
- stream.unread(ind == 0 ? '0' : hex.charAt(--ind));
- stream.unread('u');
- break;
- } else {
- return c;
- }
- }
- inEscape = true;
- return '\\';
- }
- }
-
- private int stripComments(int c, PushbackInputStream stream) throws IOException {
- switch (c) {
- case '/':
- if (isNextOrUnread(stream, '/')) {
- //single line comment: read to end of line
- for (;;) {
- c = stream.read();
- if (c == -1 || c == '\r' || c == '\n') {
- return c;
+ if (c != -1 && !Character.isWhitespace(c)) {
+ if (needsComma) {
+ stream.unread(c);
+ stream.unread(' ');
+ needsComma = false;
+ return ',';
+ } else if (c == '"' || c == '\'') {
+ quoteChar = c;
}
}
- } else if (isNextOrUnread(stream,'*')) {
- //multiline comment: read till next "*/"
- for (;;) {
- c = stream.read();
- if (c == -1) {
- return c;
- } else if (c == '*') {
- c = stream.read();
- if (c == -1) {
- return c;
- } else if (c == '/') {
- //replace entire comment with single space
- return ' ';
- }
- }
- }
- } else {
- return c;
- }
- case '<':
- if (isNextOrUnread(stream,'!','[','C','D','A','T','A','[')) {
- inCDATA = true;
- return ' ';
- } else {
- return c;
- }
- case '#':
- for (;;) {
- c = stream.read();
- if (c == -1 || c == '\r' || c == '\n') {
- return c;
- }
- }
- case ']':
- if (inCDATA) {
- if (isNextOrUnread(stream, ']', '>')) {
- inCDATA = false;
- return ' ';
- } else {
- return c;
- }
- } else {
return c;
- }
- default:
- return c;
+ }
}
-
}
-
}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
index 402e267..71f2459 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
@@ -17,12 +17,16 @@
package org.apache.any23.extractor.rdf;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
import com.github.jsonldjava.utils.JsonUtils;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.eclipse.rdf4j.rio.RDFParser;
+import java.lang.reflect.Field;
+
/**
* Concrete implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor}
* handling <a href="http://www.w3.org/TR/json-ld/">JSON-LD</a> format.
@@ -41,6 +45,29 @@ public class JSONLDExtractor extends BaseRDFExtractor {
throw new AssertionError("You have an outdated version of jsonld-java on the classpath. " +
"Upgrade to at least version 0.12.0. See: https://issues.apache.org/jira/browse/ANY23-336", th);
}
+
+ JsonFactory JSON_FACTORY;
+ try {
+ Field field = JsonUtils.class.getDeclaredField("JSON_FACTORY");
+ field.setAccessible(true);
+ JSON_FACTORY = (JsonFactory)field.get(null);
+ } catch (Exception e) {
+ throw new AssertionError(e);
+ }
+
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER);
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_COMMENTS);
+ JSON_FACTORY.disable(JsonParser.Feature.ALLOW_MISSING_VALUES); //handled by JsonCleaningInputStream
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS);
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS);
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES);
+ JSON_FACTORY.disable(JsonParser.Feature.ALLOW_TRAILING_COMMA); //handled by JsonCleaningInputStream
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES);
+ JSON_FACTORY.enable(JsonParser.Feature.ALLOW_YAML_COMMENTS);
+ JSON_FACTORY.enable(JsonParser.Feature.IGNORE_UNDEFINED);
+ JSON_FACTORY.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION);
+ JSON_FACTORY.disable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);
}
http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
index 2778621..6b4406a 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
@@ -284,7 +284,7 @@ public class RDFParserFactory {
) {
parser.getParserConfig().setNonFatalErrors(stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
- parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
+ parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
parser.setParseErrorListener(new InternalParseErrorListener(extractionResult));
parser.setValueFactory(
http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
index 41a0711..4141bd2 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
@@ -17,6 +17,7 @@
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.IssueReport;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.FOAF;
import org.junit.Test;
@@ -75,13 +76,20 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
assertExtract("/html/html-jsonld-unescaped-characters.html");
assertModelNotEmpty();
assertStatementsSize(null, null, null, 375);
- assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\\\u0008");
+ assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\u0008");
assertContains(RDFUtils.iri("http://schema.org/description"),
"#1 MAGIC SHOW IN L.A.\nThe current WINNER of the CW’s Penn & Teller’s FOOL US, Illusionist " +
"extraordinaire Ivan Amodei is on a national tour with his show INTIMATE ILLUSIONS." +
"\n\nCurrently, on an ei...");
}
+ @Test
+ public void testJSONLDFatalError() {
+ assertExtract("/html/html-jsonld-fatal-error.html",false);
+ assertIssue(IssueReport.IssueLevel.FATAL, ".*Unexpected character .* was expecting comma to separate Object entries.*");
+ assertStatementsSize(null, null, null, 4);
+ }
+
@Override
protected ExtractorFactory<?> getExtractorFactory() {
return new EmbeddedJSONLDExtractorFactory();
http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/test-resources/src/test/resources/html/html-jsonld-fatal-error.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/html-jsonld-fatal-error.html b/test-resources/src/test/resources/html/html-jsonld-fatal-error.html
new file mode 100644
index 0000000..1ccb7ab
--- /dev/null
+++ b/test-resources/src/test/resources/html/html-jsonld-fatal-error.html
@@ -0,0 +1,61 @@
+<!DOCTYPE html>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+--> <!-- Excerpted from: http://osl.ugr.es/JSLUGR/ -->
+<html lang="es">
+
+<head>
+ <title>Jornadas de Software Libre de la Universidad de Granada</title>
+</head>
+
+<body id="page-top" data-spy="scroll" data-target=".navbar-fixed-top">
+
+
+
+<script type="application/ld+json">
+ {
+ "@context": "http://schema.org",
+ "@type": "Organization",
+ "url": "http://osl.ugr.es",
+ "contactPoint": [{
+ "@type": "ContactPoint",
+ "email": "osl@ugr.es",
+ "name": "Jornadas de Software Libre"
+ "contactType": "Organizing committee",
+ "url": "http://osl.ugr.es"
+ }]
+ }
+ </script>
+
+<script type="application/ld+json">
+ {
+ "@context": {
+ "ical": "http://www.w3.org/2002/12/cal/ical#",
+ "xsd": "http://www.w3.org/2001/XMLSchema#",
+ "ical:dtstart": {
+ "@type": "xsd:dateTime"
+ }
+ },
+ "ical:summary": "Jornadas de Software Libre",
+ "ical:location": "Por determinar. Granada, España",
+ "ical:dtstart": "2017-09-27T08:00Z",
+ "ical:dtend": "2017-09-28T16:00Z"
+ }
+ </script>
+
+</body>
+
+</html>
\ No newline at end of file