You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2018/01/24 06:11:47 UTC
[1/2] any23 git commit: ANY23-324 Changed default html parser from NekoHTML to Jsoup. This also indirectly fixes ANY23-317, ANY23-273, ANY23-267, and ANY23-326.
Repository: any23
Updated Branches:
refs/heads/master f36c5e162 -> 07f7421cd
ANY23-324 Changed default html parser from NekoHTML to Jsoup. This also indirectly fixes ANY23-317, ANY23-273, ANY23-267, and ANY23-326.
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/2c76ada3
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/2c76ada3
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/2c76ada3
Branch: refs/heads/master
Commit: 2c76ada3bc812c37a46863e0529363f42339582a
Parents: f36c5e1
Author: Hans <fi...@gmail.com>
Authored: Thu Jan 18 15:08:27 2018 -0600
Committer: Hans <fi...@gmail.com>
Committed: Sun Jan 21 16:47:34 2018 -0600
----------------------------------------------------------------------
.../resources/default-configuration.properties | 4 +
core/pom.xml | 4 +
.../extractor/html/EmbeddedJSONLDExtractor.java | 6 +-
.../any23/extractor/html/HCardExtractor.java | 3 +-
.../any23/extractor/html/HTMLMetaExtractor.java | 6 +-
.../any23/extractor/html/TagSoupParser.java | 173 ++++++++------
.../html/TagSoupParsingConfiguration.java | 224 +++++++++++++++++++
.../microdata/MicrodataParserTest.java | 5 +-
pom.xml | 5 +
9 files changed, 352 insertions(+), 78 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index 4f68586..d1d35de 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -76,3 +76,7 @@ any23.extraction.csv.comment=#
# A confidence threshold for the OpenIE extractions
# Any extractions below this value will not be processed.
any23.extraction.openie.confidence.threshold=0.5
+
+# Use legacy setting to parse html
+# with NekoHTML instead of Jsoup
+any23.tagsoup.legacy=off
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index 554845a..59611d4 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -75,6 +75,10 @@
<artifactId>nekohtml</artifactId>
</dependency>
<dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ </dependency>
+ <dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
</dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
index 34728e5..1e6efdf 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
@@ -123,8 +123,10 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
for (Node linkNode : linkNodes) {
NamedNodeMap attributes = linkNode.getAttributes();
- String rel = attributes.getNamedItem("rel").getTextContent();
- String href = attributes.getNamedItem("href").getTextContent();
+ Node relNode = attributes.getNamedItem("rel");
+ String rel = relNode == null ? null : relNode.getTextContent();
+ Node hrefNode = attributes.getNamedItem("href");
+ String href = hrefNode == null ? null : hrefNode.getTextContent();
if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
index c1160fa..822a8eb 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
@@ -101,7 +101,8 @@ public class HCardExtractor extends EntityBasedMicroformatExtractor {
report.notifyIssue(
IssueReport.IssueLevel.WARNING,
"Current node tries to include an ancestor node.",
- nodeLocation[0], nodeLocation[1]
+ nodeLocation == null ? -1 : nodeLocation[0],
+ nodeLocation == null ? -1 : nodeLocation[1]
);
continue;
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
index a3c6550..3ca4f50 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
@@ -139,8 +139,10 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
for(Node linkNode : linkNodes) {
NamedNodeMap attributes = linkNode.getAttributes();
- String rel = attributes.getNamedItem("rel").getTextContent();
- String href = attributes.getNamedItem("href").getTextContent();
+ Node relNode = attributes.getNamedItem("rel");
+ String rel = relNode == null ? null : relNode.getTextContent();
+ Node hrefNode = attributes.getNamedItem("href");
+ String href = hrefNode == null ? null : hrefNode.getTextContent();
if(rel != null && href !=null && RDFUtils.isAbsoluteIRI(href)) {
prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index 9ef72f4..2147520 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -17,6 +17,7 @@
package org.apache.any23.extractor.html;
+import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.validator.DefaultValidator;
import org.apache.any23.validator.Validator;
import org.apache.any23.validator.ValidatorException;
@@ -56,6 +57,7 @@ import java.nio.charset.UnsupportedCharsetException;
* @author Michele Mostarda (mostarda@fbk.eu)
* @author Davide Palmisano (palmisano@fbk.eu)
*/
+
public class TagSoupParser {
public static final String ELEMENT_LOCATION = "Element-Location";
@@ -69,24 +71,32 @@ public class TagSoupParser {
private final String documentIRI;
private final String encoding;
-
+
+ private final TagSoupParsingConfiguration config;
+
private Document result = null;
+
public TagSoupParser(InputStream input, String documentIRI) {
this.input = input;
this.documentIRI = documentIRI;
this.encoding = null;
+
+ config = TagSoupParsingConfiguration.getDefault();
}
public TagSoupParser(InputStream input, String documentIRI, String encoding) {
- if(encoding != null && !Charset.isSupported(encoding))
+ if (encoding != null && !Charset.isSupported(encoding))
throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding));
this.input = input;
this.documentIRI = documentIRI;
this.encoding = encoding;
+
+ config = TagSoupParsingConfiguration.getDefault();
}
+
/**
* Returns the DOM of the given document IRI.
*
@@ -97,22 +107,10 @@ public class TagSoupParser {
if (result == null) {
long startTime = System.currentTimeMillis();
try {
- result = parse();
- } catch (SAXException ex) {
- // should not happen, it's a tag soup parser
- throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex);
- } catch (TransformerException ex) {
- // should not happen, it's a tag soup parser
- throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex);
- } catch (NullPointerException ex) {
- if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
- throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex);
- } else {
- throw ex;
- }
+ result = config.parse(input, documentIRI, encoding);
} finally {
long elapsed = System.currentTimeMillis() - startTime;
- logger.debug("Parsed " + documentIRI + " with NekoHTML, " + elapsed + "ms");
+ logger.debug("Parsed " + documentIRI + " with " + config.name() + ", " + elapsed + "ms");
}
}
result.setDocumentURI(documentIRI);
@@ -142,70 +140,103 @@ public class TagSoupParser {
return new DocumentReport( validator.validate(dIRI, document, applyFix), document );
}
- private Document parse() throws IOException, SAXException, TransformerException {
- final DOMParser parser = new DOMParser() {
- private QName currentQName;
- private Augmentations currentAugmentations;
+ static TagSoupParsingConfiguration legacyConfig() {
+ return NekoHTML.instance;
+ }
+
+ private static class NekoHTML extends TagSoupParsingConfiguration {
+
+ private static final NekoHTML instance = new NekoHTML();
- @Override
- protected Element createElementNode(QName qName) {
- final Element created = super.createElementNode(qName);
- if (qName.equals(currentQName) && currentAugmentations != null) {
- final ElementLocation elementLocation = createElementLocation(
- currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
- );
- created.setUserData(ELEMENT_LOCATION, elementLocation, null);
+ @Override
+ Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
+ try {
+ return parse(input, encoding);
+ } catch (SAXException ex) {
+ // should not happen, it's a tag soup parser
+ throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
+ } catch (TransformerException ex) {
+ // should not happen, it's a tag soup parser
+ throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
+ } catch (NullPointerException ex) {
+ if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
+ throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex);
+ } else {
+ throw ex;
}
- return created;
}
+ }
- @Override
- public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations)
- throws XNIException {
- super.startElement(qName, xmlAttributes, augmentations);
- currentQName = qName;
- currentAugmentations = augmentations;
- }
+ private Document parse(InputStream input, String encoding) throws IOException, SAXException, TransformerException {
+ final DOMParser parser = new DOMParser() {
+
+ private QName currentQName;
+ private Augmentations currentAugmentations;
- private ElementLocation createElementLocation(Object obj) {
- if(obj == null) return null;
- String pattern = null;
- try {
- pattern = obj.toString();
- if( "synthesized".equals(pattern) ) return null;
- final String[] parts = pattern.split(":");
- return new ElementLocation(
- Integer.parseInt(parts[0]),
- Integer.parseInt(parts[1]),
- Integer.parseInt(parts[3]),
- Integer.parseInt(parts[4])
-
- );
- } catch (Exception e) {
- logger.warn(
- String.format("Unexpected string format for given augmentation: [%s]", pattern),
- e
- );
- return null;
+ @Override
+ protected Element createElementNode(QName qName) {
+ final Element created = super.createElementNode(qName);
+ if (qName.equals(currentQName) && currentAugmentations != null) {
+ final ElementLocation elementLocation = createElementLocation(
+ currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
+ );
+ created.setUserData(ELEMENT_LOCATION, elementLocation, null);
+ }
+ return created;
}
- }
- };
- parser.setFeature("http://xml.org/sax/features/namespaces", false);
- parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
- parser.setFeature(AUGMENTATIONS_FEATURE, true);
- if (this.encoding != null)
- parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.encoding);
-
- /*
- * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko
- * parser. This will ensure the correct handling of inline HTML SPAN tags.
- * This fix is documented at issue #78.
- */
- parser.parse(new InputSource( new SpanCloserInputStream(input)));
- return parser.getDocument();
+
+ @Override
+ public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations)
+ throws XNIException {
+ super.startElement(qName, xmlAttributes, augmentations);
+ currentQName = qName;
+ currentAugmentations = augmentations;
+ }
+
+ private ElementLocation createElementLocation(Object obj) {
+ if(obj == null) return null;
+ String pattern = null;
+ try {
+ pattern = obj.toString();
+ if( "synthesized".equals(pattern) ) return null;
+ final String[] parts = pattern.split(":");
+ return new ElementLocation(
+ Integer.parseInt(parts[0]),
+ Integer.parseInt(parts[1]),
+ Integer.parseInt(parts[3]),
+ Integer.parseInt(parts[4])
+
+ );
+ } catch (Exception e) {
+ logger.warn(
+ String.format("Unexpected string format for given augmentation: [%s]", pattern),
+ e
+ );
+ return null;
+ }
+ }
+ };
+ parser.setFeature("http://xml.org/sax/features/namespaces", false);
+ parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
+ parser.setFeature(AUGMENTATIONS_FEATURE, true);
+ if (encoding != null)
+ parser.setProperty("http://cyberneko.org/html/properties/default-encoding", encoding);
+
+ /*
+ * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko
+ * parser. This will ensure the correct handling of inline HTML SPAN tags.
+ * This fix is documented at issue #78.
+ */
+ parser.parse(new InputSource( new SpanCloserInputStream(input)));
+ return parser.getDocument();
+ }
+
+
}
+
+
/**
* Describes a <i>DOM Element</i> location.
*/
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
new file mode 100644
index 0000000..1cf2538
--- /dev/null
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
@@ -0,0 +1,224 @@
+package org.apache.any23.extractor.html;
+
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.parser.Parser;
+import org.jsoup.select.NodeTraversor;
+import org.jsoup.select.NodeVisitor;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.Text;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+import java.util.Arrays;
+
+abstract class TagSoupParsingConfiguration {
+
+ static final String LEGACY_PROPERTY = "any23.tagsoup.legacy";
+
+ String name() {
+ return getClass().getSimpleName();
+ }
+
+ abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;
+
+
+ static TagSoupParsingConfiguration getDefault() {
+ return Default.instance;
+ }
+
+ private static class Default {
+
+ private static final TagSoupParsingConfiguration instance = DefaultConfiguration.singleton()
+ .getFlagProperty(LEGACY_PROPERTY) ? TagSoupParser.legacyConfig() : JsoupConfig.instance;
+
+ }
+
+
+ private static class JsoupConfig extends TagSoupParsingConfiguration {
+
+ private static final JsoupConfig instance = new JsoupConfig();
+
+
+ @Override
+ Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
+ //Jsoup doesn't allow null document URIs
+
+ if (documentIRI == null) {
+ documentIRI = "";
+ }
+
+ //workaround for Jsoup issue #1009
+ if (encoding == null) {
+
+ int c;
+ do {
+ c = input.read();
+ } while (c != -1 && Character.isWhitespace(c));
+
+ if (c != -1) {
+ int capacity = 256;
+ byte[] bytes = new byte[capacity];
+ int length = 0;
+ bytes[length++] = (byte)c;
+
+ if (c == '<') {
+ c = input.read();
+ if (c != -1) {
+ bytes[length++] = (byte)c;
+ if (c == '?') {
+ c = input.read();
+
+ while (c != -1) {
+ if (length == capacity) {
+ capacity *= 2;
+ bytes = Arrays.copyOf(bytes, capacity);
+ }
+ bytes[length++] = (byte)c;
+
+ if (c == '>') {
+ if (length >= 20 && bytes[length - 2] == '?') {
+ String decl = "<" + new String(bytes, 2, length - 4) + ">";
+ org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
+ for (org.jsoup.nodes.Element el : doc.children()) {
+ if ("xml".equalsIgnoreCase(el.tagName())) {
+ String enc = el.attr("encoding");
+ if (enc != null && !enc.isEmpty()) {
+ encoding = enc;
+ break;
+ }
+ }
+ }
+ }
+ break;
+ }
+
+ c = input.read();
+ }
+ }
+ }
+
+ }
+
+ input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
+ }
+
+ }
+
+ org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(input, encoding, documentIRI);
+
+ return convert(document);
+ }
+
+
+ private static Document convert(org.jsoup.nodes.Document document) {
+ Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
+
+ for (org.jsoup.nodes.Element rootEl : document.children()) {
+ NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
+ }
+
+ return w3cDoc;
+ }
+
+ private static class DocumentConverter implements NodeVisitor {
+
+ private final Document doc;
+ private org.w3c.dom.Element dest;
+
+ DocumentConverter(Document doc) {
+ this.doc = doc;
+ }
+
+ @Override
+ public void head(org.jsoup.nodes.Node source, int depth) {
+ if (source instanceof org.jsoup.nodes.Element) {
+ org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
+
+ org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
+ copyAttributes(sourceEl, el);
+ if (dest == null) {
+ doc.appendChild(el);
+ } else {
+ dest.appendChild(el);
+ }
+ dest = el;
+ } else if (source instanceof org.jsoup.nodes.TextNode) {
+ org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
+ Text text = doc.createTextNode(sourceText.getWholeText());
+ dest.appendChild(text);
+ } else if (source instanceof org.jsoup.nodes.Comment) {
+ org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
+ Comment comment = doc.createComment(sourceComment.getData());
+ dest.appendChild(comment);
+ } else if (source instanceof org.jsoup.nodes.DataNode) {
+ org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
+ Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
+ dest.appendChild(node);
+ }
+ }
+
+ @Override
+ public void tail(org.jsoup.nodes.Node source, int depth) {
+ if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) {
+ dest = (org.w3c.dom.Element) dest.getParentNode();
+ }
+ }
+
+ private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
+ for (Attribute attribute : source.attributes()) {
+ // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
+ String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
+ if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
+ el.setAttribute(key, attribute.getValue());
+ }
+ }
+ }
+
+ private static String stripCDATA(String string) {
+ return reduceToContent(string, "<![CDATA[", "]]>");
+ }
+
+ private static String reduceToContent(String string, String startMarker, String endMarker) {
+ int i = 0;
+ int startContent = -1;
+ int l1 = startMarker.length();
+
+ int l2;
+ char c;
+ for(l2 = endMarker.length(); i < string.length() - l1 - l2; ++i) {
+ c = string.charAt(i);
+ if (!Character.isWhitespace(c)) {
+ if (c == startMarker.charAt(0) && startMarker.equals(string.substring(i, l1 + i))) {
+ startContent = i + l1;
+ break;
+ }
+
+ return string;
+ }
+ }
+
+ if (startContent != -1) {
+ for(i = string.length() - 1; i > startContent + l2; --i) {
+ c = string.charAt(i);
+ if (!Character.isWhitespace(c)) {
+ if (c == endMarker.charAt(l2 - 1) && endMarker.equals(string.substring(i - l2 + 1, i + 1))) {
+
+ return string.substring(startContent, i - 2);
+ }
+
+ return string;
+ }
+ }
+
+ }
+ return string;
+ }
+
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
index 4fa237e..c58a92b 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
@@ -275,10 +275,11 @@ public class MicrodataParserTest {
}
for(int i = 0; i < errors.length; i++) {
+ //Jsoup doesn't support element locations
Assert.assertEquals(
"Error while comparing error [" + i + "]",
- resultContent.getProperty("error" + i),
- errors[i].toJSON()
+ resultContent.getProperty("error" + i).replaceAll("_row\" : -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+", "_col\" : -1"),
+ errors[i].toJSON().replaceAll("_row\" : -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+", "_col\" : -1")
);
}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index dde7581..0b03914 100644
--- a/pom.xml
+++ b/pom.xml
@@ -364,6 +364,11 @@
<artifactId>nekohtml</artifactId>
<version>1.9.20</version>
</dependency>
+ <dependency>
+ <groupId>org.jsoup</groupId>
+ <artifactId>jsoup</artifactId>
+ <version>1.11.2</version>
+ </dependency>
<!-- BEGIN: Tika -->
<dependency>
[2/2] any23 git commit: ANY23-324 Added license to TagSoupParsingConfiguration
Posted by le...@apache.org.
ANY23-324 Added license to TagSoupParsingConfiguration
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/07f7421c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/07f7421c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/07f7421c
Branch: refs/heads/master
Commit: 07f7421cd5288b7440bd1a88c21019503770f760
Parents: 2c76ada
Author: Hans <fi...@gmail.com>
Authored: Tue Jan 23 12:18:18 2018 -0600
Committer: Hans <fi...@gmail.com>
Committed: Tue Jan 23 12:18:18 2018 -0600
----------------------------------------------------------------------
.../html/TagSoupParsingConfiguration.java | 22 ++++++++++++++++++++
1 file changed, 22 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/07f7421c/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
index 1cf2538..77e4524 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
@@ -1,3 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.any23.extractor.html;
import org.apache.any23.configuration.DefaultConfiguration;
@@ -15,6 +32,11 @@ import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Arrays;
+
+/**
+ * The parsing configuration for a {@link TagSoupParser}
+ * @author Hans Brende
+ */
abstract class TagSoupParsingConfiguration {
static final String LEGACY_PROPERTY = "any23.tagsoup.legacy";