You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/04/02 17:06:58 UTC
[2/2] any23 git commit: ANY23-340 Removes doctypes to allow extraction of additional rdfa 1.1 triples
ANY23-340 Removes doctypes to allow extraction of additional rdfa 1.1 triples
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/60d6f616
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/60d6f616
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/60d6f616
Branch: refs/heads/master
Commit: 60d6f61644e307def7e6b5e193af2e2d46421b5d
Parents: a1b72b7
Author: Hans <fi...@gmail.com>
Authored: Fri Mar 30 15:03:17 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Fri Mar 30 15:03:17 2018 -0500
----------------------------------------------------------------------
.../any23/extractor/rdf/BaseRDFExtractor.java | 21 +-
.../extractor/rdfa/RDFa11ExtractorTest.java | 10 +
.../test/resources/html/BBC_News_Scotland.html | 3780 ++++++++++++++++++
3 files changed, 3802 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/60d6f616/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 61b58c1..1882ed9 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -29,12 +29,14 @@ import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
+import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
+import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Entities;
import org.jsoup.nodes.Node;
+import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
-import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -131,17 +133,18 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
.syntax(Document.OutputSettings.Syntax.xml)
.escapeMode(Entities.EscapeMode.xhtml)
.charset(charset);
- //Delete scripts. Json-ld in script tags is extracted first
- //from tag soup dom, so we should be fine.
- NodeTraversor.traverse(new NodeVisitor() {
+ // Delete scripts, comments, and doctypes
+ // See https://issues.apache.org/jira/browse/ANY23-317
+ // and https://issues.apache.org/jira/browse/ANY23-340
+ NodeTraversor.filter(new NodeFilter() {
@Override
- public void head(Node node, int depth) {
- if (node instanceof DataNode) {
- ((DataNode) node).setWholeData("");
- }
+ public FilterResult head(Node node, int depth) {
+ return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType
+ ? FilterResult.REMOVE : FilterResult.CONTINUE;
}
@Override
- public void tail(Node node, int depth) {
+ public FilterResult tail(Node node, int depth) {
+ return FilterResult.CONTINUE;
}
}, doc);
http://git-wip-us.apache.org/repos/asf/any23/blob/60d6f616/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
index 0599aaf..c0767c9 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
@@ -61,6 +61,16 @@ public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase {
}
@Test
+ public void testBBCNewsScotland() {
+ assertExtract("/html/BBC_News_Scotland.html");
+ assertModelNotEmpty();
+ assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#navigation"), 1);
+ assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#search"), 1);
+ assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#contentinfo"), 1);
+ assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#presentation"), 8);
+ }
+
+ @Test
public void testIssue326() {
assertExtract("/html/rdfa/rdfa-issue326-and-267.html");
}