You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by ha...@apache.org on 2018/04/02 17:06:58 UTC

[2/2] any23 git commit: ANY23-340 Removes doctypes to allow extraction of additional RDFa 1.1 triples

ANY23-340 Removes doctypes to allow extraction of additional RDFa 1.1 triples


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/60d6f616
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/60d6f616
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/60d6f616

Branch: refs/heads/master
Commit: 60d6f61644e307def7e6b5e193af2e2d46421b5d
Parents: a1b72b7
Author: Hans <fi...@gmail.com>
Authored: Fri Mar 30 15:03:17 2018 -0500
Committer: Hans <fi...@gmail.com>
Committed: Fri Mar 30 15:03:17 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/rdf/BaseRDFExtractor.java   |   21 +-
 .../extractor/rdfa/RDFa11ExtractorTest.java     |   10 +
 .../test/resources/html/BBC_News_Scotland.html  | 3780 ++++++++++++++++++
 3 files changed, 3802 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/60d6f616/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 61b58c1..1882ed9 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -29,12 +29,14 @@ import org.eclipse.rdf4j.rio.RDFParser;
 import org.eclipse.rdf4j.rio.RDFHandlerException;
 import org.eclipse.rdf4j.rio.RioSetting;
 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
+import org.jsoup.nodes.Comment;
 import org.jsoup.nodes.DataNode;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.DocumentType;
 import org.jsoup.nodes.Entities;
 import org.jsoup.nodes.Node;
+import org.jsoup.select.NodeFilter;
 import org.jsoup.select.NodeTraversor;
-import org.jsoup.select.NodeVisitor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -131,17 +133,18 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
                         .syntax(Document.OutputSettings.Syntax.xml)
                         .escapeMode(Entities.EscapeMode.xhtml)
                         .charset(charset);
-                //Delete scripts. Json-ld in script tags is extracted first
-                //from tag soup dom, so we should be fine.
-                NodeTraversor.traverse(new NodeVisitor() {
+                // Delete scripts, comments, and doctypes
+                // See https://issues.apache.org/jira/browse/ANY23-317
+                // and https://issues.apache.org/jira/browse/ANY23-340
+                NodeTraversor.filter(new NodeFilter() {
                     @Override
-                    public void head(Node node, int depth) {
-                        if (node instanceof DataNode) {
-                            ((DataNode) node).setWholeData("");
-                        }
+                    public FilterResult head(Node node, int depth) {
+                        return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType
+                                ? FilterResult.REMOVE : FilterResult.CONTINUE;
                     }
                     @Override
-                    public void tail(Node node, int depth) {
+                    public FilterResult tail(Node node, int depth) {
+                        return FilterResult.CONTINUE;
                     }
                 }, doc);
 

http://git-wip-us.apache.org/repos/asf/any23/blob/60d6f616/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
index 0599aaf..c0767c9 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
@@ -61,6 +61,16 @@ public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase {
     }
 
     @Test
+    public void testBBCNewsScotland() {
+        assertExtract("/html/BBC_News_Scotland.html");
+        assertModelNotEmpty();
+        assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#navigation"), 1);
+        assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#search"), 1);
+        assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#contentinfo"), 1);
+        assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#presentation"), 8);
+    }
+
+    @Test
     public void testIssue326() {
         assertExtract("/html/rdfa/rdfa-issue326-and-267.html");
     }