You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/08/06 19:33:33 UTC
any23 git commit: improve JsonCleaningInputStream
Repository: any23
Updated Branches:
refs/heads/master 22b3047d5 -> e046f7329
improve JsonCleaningInputStream
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/e046f732
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/e046f732
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/e046f732
Branch: refs/heads/master
Commit: e046f7329538b61f17225e64f79c280c4d248aa9
Parents: 22b3047
Author: Hans <fi...@gmail.com>
Authored: Mon Aug 6 14:31:08 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Mon Aug 6 14:31:08 2018 -0500
----------------------------------------------------------------------
.../any23/extractor/rdf/BaseRDFExtractor.java | 147 ----------
.../any23/extractor/rdf/JSONLDExtractor.java | 6 +-
.../extractor/rdf/JsonCleaningInputStream.java | 271 +++++++++++++++++++
.../test/java/org/apache/any23/Any23Test.java | 2 +-
.../extractor/rdf/JSONLDExtractorTest.java | 2 +-
5 files changed, 276 insertions(+), 152 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index ea582cb..796bada 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -46,7 +46,6 @@ import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
@@ -215,150 +214,4 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
}
}
-
- static class JsonCleaningInputStream extends InputStream {
-
- private boolean inEscape;
- private int quoteChar;
- private boolean inCDATA;
- private boolean needsComma;
-
- private final PushbackInputStream wrapped;
-
- JsonCleaningInputStream(InputStream in) {
- wrapped = new PushbackInputStream(in, 16);
- }
-
- private static boolean isNextOrUnread(PushbackInputStream stream, int... next) throws IOException {
- int i = -1;
- for (int test : next) {
- int c = stream.read();
- if (c != test) {
- if (c != -1) {
- stream.unread(c);
- }
- while (i >= 0) {
- stream.unread(next[i--]);
- }
- return false;
- }
- i++;
- }
- return true;
- }
-
- @Override
- public int read() throws IOException {
- PushbackInputStream stream = wrapped;
-
- for (;;) {
- int c = stream.read();
-
- //other types of comments are handled by enabling fasterxml's
- //ALLOW_COMMENTS and ALLOW_YAML_COMMENTS features
- if (inCDATA) {
- if (c == ']' && isNextOrUnread(stream, ']', '>')) {
- inCDATA = false;
- continue;
- }
- } else {
- if (c == '<' && isNextOrUnread(stream, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
- inCDATA = true;
- continue;
- }
- }
-
- int q = quoteChar;
- if (q != 0) {
- //we're in a quote
- if (inEscape) {
- //end escape
- inEscape = false;
- } else if (c == '\\') {
- //begin escape
- inEscape = true;
- } else if (c == q) {
- //end quote
- quoteChar = 0;
- }
- return c;
- }
-
- //we're not in a quote
- switch (c) {
- case ',':
- case ';':
- //don't write out comma yet!
- needsComma = true;
- continue;
- case '}':
- case ']':
- //discard comma at end of object or array
- needsComma = false;
- return c;
- case -1:
- case '\r':
- case '\n':
- return c;
- case 0x09:
- case 0x0b:
- case 0x0c:
- case 0x1c:
- case 0x1d:
- case 0x1e:
- case 0x1f:
- case 0x20:
- return ' ';
- case 0xc2:
- if (isNextOrUnread(stream, 0xa0)) {
- return ' ';
- }
- break;
- case 0xe1:
- if (isNextOrUnread(stream, 0x9a, 0x80)
- || isNextOrUnread(stream, 0xa0, 0x8e)) {
- return ' ';
- }
- break;
- case 0xe2:
- int c1 = stream.read();
- if (c1 == 0x80) {
- int c2 = stream.read();
- //space separators
- if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
- //line and paragraph separators
- || c2 == 0xa8 || c2 == 0xa9) {
- return ' ';
- }
- stream.unread(c2);
- } else if (c1 == 0x81) {
- int c2 = stream.read();
- if (c2 == 0x9f) {
- return ' ';
- }
- stream.unread(c2);
- }
- stream.unread(c1);
- break;
- case 0xe3:
- if (isNextOrUnread(stream, 0x80, 0x80)) {
- return ' ';
- }
- break;
- default:
- break;
- }
- if (needsComma) {
- stream.unread(c);
- stream.unread(' ');
- needsComma = false;
- return ',';
- } else if (c == '"' || c == '\'') {
- quoteChar = c;
- }
- return c;
- }
- }
- }
-
}
http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
index 71f2459..1806adf 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java
@@ -56,15 +56,15 @@ public class JSONLDExtractor extends BaseRDFExtractor {
}
JSON_FACTORY.enable(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER);
- JSON_FACTORY.enable(JsonParser.Feature.ALLOW_COMMENTS);
+ JSON_FACTORY.disable(JsonParser.Feature.ALLOW_COMMENTS); //handled by JsonCleaningInputStream
JSON_FACTORY.disable(JsonParser.Feature.ALLOW_MISSING_VALUES); //handled by JsonCleaningInputStream
JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS);
JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS);
- JSON_FACTORY.enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES);
+ JSON_FACTORY.disable(JsonParser.Feature.ALLOW_SINGLE_QUOTES); //handled by JsonCleaningInputStream
JSON_FACTORY.disable(JsonParser.Feature.ALLOW_TRAILING_COMMA); //handled by JsonCleaningInputStream
JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS);
JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES);
- JSON_FACTORY.enable(JsonParser.Feature.ALLOW_YAML_COMMENTS);
+ JSON_FACTORY.disable(JsonParser.Feature.ALLOW_YAML_COMMENTS); //handled by JsonCleaningInputStream
JSON_FACTORY.enable(JsonParser.Feature.IGNORE_UNDEFINED);
JSON_FACTORY.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION);
JSON_FACTORY.disable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);
http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java b/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java
new file mode 100644
index 0000000..bda229e
--- /dev/null
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.rdf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+/**
+ * An {@link InputStream} filter that repairs common JSON syntax errors on the fly,
+ * one byte at a time. The strategies applied are:
+ * <ol>
+ * <li>Remove CDATA markers</li>
+ * <li>Remove YAML and C-style comments</li>
+ * <li>Allow single-quoted strings (they are re-emitted as double-quoted strings, and any
+ * unescaped double quote inside them is emitted as {@code \"})</li>
+ * <li>Ignore duplicated commas between elements of objects and arrays</li>
+ * <li>Remove trailing commas from objects and arrays</li>
+ * <li>Insert omitted commas after objects and arrays</li>
+ * <li>Ignore all Unicode whitespace characters (assumes UTF-8 encoding)</li>
+ * <li>Treat semi-colons as commas</li>
+ * </ol>
+ *
+ * <p>A separator (comma or semi-colon) is never written immediately. It is remembered,
+ * together with the line break and indentation that follow it, and is only emitted once
+ * the next significant byte shows that another element really follows. The remembered
+ * whitespace is then replayed through the push-back buffer as plain spaces, so the
+ * push-back buffer must be large enough to hold that replay.</p>
+ *
+ * @author Hans Brende (hansbrende@apache.org)
+ */
+class JsonCleaningInputStream extends InputStream {
+
+    // Values of currentState. 0 means "plain JSON"; any value other than the two
+    // below is the quote character ('"' or '\'') that opened the current string.
+    private static final int EOL_COMMENT = 1;
+    private static final int MULTILINE_COMMENT = 2;
+
+    // Values of needsComma. 0 means "no separator pending". A value v greater than
+    // NEEDS_COMMA_AND_NEWLINE additionally records (v - NEEDS_COMMA_AND_NEWLINE)
+    // whitespace bytes that were swallowed after the line break.
+    private static final int NEEDS_COMMA = 1;
+    private static final int NEEDS_COMMA_AND_NEWLINE = 2;
+
+    // Upper bound for needsComma. The counter saturates here instead of wrapping
+    // around to 0, which would silently drop the pending comma.
+    private static final int MAX_NEEDS_COMMA = 0xFF;
+
+    // Longest look-ahead that may have to be restored: the 8 bytes of "![CDATA[".
+    private static final int MAX_LOOKAHEAD = 8;
+
+    // Worst case: a failed CDATA look-ahead is restored (MAX_LOOKAHEAD bytes) and then the
+    // comma replay pushes back the current byte, a newline and (MAX_NEEDS_COMMA - 2) spaces.
+    private static final int PUSHBACK_CAPACITY = MAX_LOOKAHEAD + MAX_NEEDS_COMMA;
+
+    // true while the previous byte inside a string was an unescaped backslash
+    private boolean inEscape;
+    // true between "<![CDATA[" and "]]>"
+    private boolean inCDATA;
+    // pending-separator state, see NEEDS_COMMA
+    private int needsComma;
+    // lexical state, see EOL_COMMENT
+    private int currentState;
+
+    private final PushbackInputStream in;
+
+    /**
+     * Wraps the given stream.
+     *
+     * @param in the raw (possibly malformed) JSON bytes, expected to be UTF-8 encoded
+     */
+    JsonCleaningInputStream(InputStream in) {
+        this.in = new PushbackInputStream(in, PUSHBACK_CAPACITY);
+    }
+
+    /**
+     * Pushes {@code c} back unless it is the end-of-stream marker.
+     */
+    private static void unread(PushbackInputStream in, int c) throws IOException {
+        if (c != -1) {
+            in.unread(c);
+        }
+    }
+
+    /**
+     * Consumes the bytes {@code next} if, and only if, they are the next bytes of the stream.
+     *
+     * @return {@code true} if all bytes matched and were consumed; {@code false} if there was
+     *         a mismatch, in which case every byte read here has been pushed back
+     */
+    private static boolean isNextOrUnread(PushbackInputStream in, int... next) throws IOException {
+        int i = -1;
+        for (int test : next) {
+            int c = in.read();
+            if (c != test) {
+                //restore the mismatching byte first, then the matched prefix in reverse order
+                unread(in, c);
+                while (i >= 0) {
+                    in.unread(next[i--]);
+                }
+                return false;
+            }
+            i++;
+        }
+        return true;
+    }
+
+    /**
+     * Returns the next byte of the cleaned JSON document.
+     *
+     * @return the next cleaned byte, or {@code -1} at the end of the wrapped stream
+     * @throws IOException if the wrapped stream fails
+     */
+    @Override
+    public int read() throws IOException {
+        PushbackInputStream in = this.in;
+
+        for (;;) {
+            int c = in.read();
+
+            if (c == -1) {
+                return c;
+            }
+
+            //CDATA markers are dropped wherever they occur, even inside strings and comments
+            if (inCDATA) {
+                if (c == ']' && isNextOrUnread(in, ']', '>')) {
+                    inCDATA = false;
+                    continue;
+                }
+            } else {
+                if (c == '<' && isNextOrUnread(in, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
+                    inCDATA = true;
+                    continue;
+                }
+            }
+
+            int ctx = currentState;
+            switch (ctx) {
+                case 0:
+                    break;
+                case EOL_COMMENT:
+                    if (c == '\r' || c == '\n') {
+                        //end single-line comment
+                        currentState = 0;
+                        if (needsComma != 0) {
+                            needsComma = NEEDS_COMMA_AND_NEWLINE;
+                            continue;
+                        }
+                        return c;
+                    }
+                    continue;
+                case MULTILINE_COMMENT:
+                    if (c == '\r' || c == '\n') {
+                        //line breaks inside the comment are kept (or remembered with the comma)
+                        if (needsComma != 0) {
+                            needsComma = NEEDS_COMMA_AND_NEWLINE;
+                            continue;
+                        }
+                        return c;
+                    } else if (c == '*' && isNextOrUnread(in, '/')) {
+                        //end multiline comment
+                        currentState = 0;
+                    }
+                    continue;
+                default:
+                    //we're in a quote
+                    if (inEscape) {
+                        //end escape
+                        inEscape = false;
+                    } else if (c == '\\') {
+                        //begin escape
+                        inEscape = true;
+                    } else if (c == ctx) {
+                        //end quote (always written as a double quote)
+                        currentState = 0;
+                        return '"';
+                    } else if (c == '"') {
+                        //Only reachable inside a single-quoted string. The string is written
+                        //out double-quoted, so a raw double quote must be escaped: emit the
+                        //backslash now and replay the quote as the escaped character.
+                        inEscape = true;
+                        in.unread(c);
+                        return '\\';
+                    }
+                    return c;
+            }
+
+            $whitespace: {
+                //we're not in a quote
+                switch (c) {
+                    case '#':
+                        currentState = EOL_COMMENT;
+                        continue;
+                    case '/':
+                        int next = in.read();
+                        if (next == '/') {
+                            currentState = EOL_COMMENT;
+                            continue;
+                        } else if (next == '*') {
+                            currentState = MULTILINE_COMMENT;
+                            continue;
+                        }
+                        unread(in, next);
+                        break;
+                    case ',':
+                    case ';':
+                        //don't write out comma yet!
+                        needsComma = NEEDS_COMMA;
+                        continue;
+                    case '}':
+                    case ']':
+                        // Only thing that can follow '}' or ']' is:
+                        // '}' or ']' or ',' or EOF
+                        // Assigning (rather than keeping) the state also discards a
+                        // trailing comma that was pending before this bracket.
+                        needsComma = NEEDS_COMMA;
+                        return c;
+                    case '\r':
+                    case '\n':
+                        if (needsComma != 0) {
+                            needsComma = NEEDS_COMMA_AND_NEWLINE;
+                            continue;
+                        }
+                        return c;
+                    // UTF-8 whitespace detection
+                    case 0x09:
+                    case 0x0b:
+                    case 0x0c:
+                    case 0x1c:
+                    case 0x1d:
+                    case 0x1e:
+                    case 0x1f:
+                    case 0x20:
+                        break $whitespace;
+                    case 0xc2:
+                        //U+00A0
+                        if (isNextOrUnread(in, 0xa0)) {
+                            break $whitespace;
+                        }
+                        break;
+                    case 0xe1:
+                        //U+1680, U+180E
+                        if (isNextOrUnread(in, 0x9a, 0x80)
+                                || isNextOrUnread(in, 0xa0, 0x8e)) {
+                            break $whitespace;
+                        }
+                        break;
+                    case 0xe2:
+                        int c1 = in.read();
+                        if (c1 == 0x80) {
+                            int c2 = in.read();
+                            //space separators (U+2000..U+200A, U+202F)
+                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
+                                    //line and paragraph separators (U+2028, U+2029)
+                                    || c2 == 0xa8 || c2 == 0xa9) {
+                                break $whitespace;
+                            }
+                            unread(in, c2);
+                            in.unread(0x80);
+                        } else if (c1 == 0x81) {
+                            int c2 = in.read();
+                            //U+205F
+                            if (c2 == 0x9f) {
+                                break $whitespace;
+                            }
+                            unread(in, c2);
+                            in.unread(0x81);
+                        } else {
+                            unread(in, c1);
+                        }
+                        break;
+                    case 0xe3:
+                        //U+3000
+                        if (isNextOrUnread(in, 0x80, 0x80)) {
+                            break $whitespace;
+                        }
+                        break;
+                    default:
+                        break;
+                }
+
+                //here: character is not whitespace
+
+                int nc = needsComma;
+                if (nc != 0) {
+                    //Another element follows: emit the pending comma now and replay the
+                    //swallowed layout followed by the current byte. This pushes back at most
+                    //MAX_NEEDS_COMMA bytes, which PUSHBACK_CAPACITY accounts for.
+                    in.unread(c);
+                    if (nc == NEEDS_COMMA) {
+                        in.unread(' ');
+                    } else {
+                        for (int i = NEEDS_COMMA_AND_NEWLINE; i < nc; i++) {
+                            in.unread(' ');
+                        }
+                        in.unread('\n');
+                    }
+                    needsComma = 0;
+                    return ',';
+                } else if (c == '"' || c == '\'') {
+                    //begin quote; remember which character closes it
+                    currentState = c;
+                    return '"';
+                }
+                return c;
+            } //end $whitespace
+
+            //here: character is whitespace
+
+            int nc = needsComma;
+            if (nc != 0) {
+                //swallow the whitespace; after a line break also count it (saturating)
+                //so that the indentation can be replayed behind the comma
+                if (nc != NEEDS_COMMA && nc < MAX_NEEDS_COMMA) {
+                    needsComma = nc + 1;
+                }
+                continue;
+            }
+
+            return ' ';
+
+        }
+
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/test/java/org/apache/any23/Any23Test.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java
index 085db04..d1d3467 100644
--- a/core/src/test/java/org/apache/any23/Any23Test.java
+++ b/core/src/test/java/org/apache/any23/Any23Test.java
@@ -345,7 +345,7 @@ public class Any23Test extends Any23OnlineTestBase {
} finally {
compositeTH1.close();
}
- logger.info(baos.toString());
+ logger.debug(baos.toString());
Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES,
cth1.getCount());
http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
index 215b552..f1338b4 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
@@ -71,7 +71,7 @@ public class JSONLDExtractorTest {
for (int i = 0; i <= Character.MAX_CODE_POINT; i++) {
if (Character.isWhitespace(i) || Character.isSpaceChar(i)) {
byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8);
- InputStream stream = new BaseRDFExtractor.JsonCleaningInputStream(new ByteArrayInputStream(bytes));
+ InputStream stream = new JsonCleaningInputStream(new ByteArrayInputStream(bytes));
if (i == '\r' || i == '\n') {
Assert.assertEquals(stream.read(), i);
} else {