You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2021/10/20 18:04:19 UTC

[any23] branch master updated: ANY23-504 XML-based parsers should not load external DTDs by default (#205)

This is an automated email from the ASF dual-hosted git repository.

lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/any23.git


The following commit(s) were added to refs/heads/master by this push:
     new c69f59a  ANY23-504 XML-based parsers should not load external DTDs by default (#205)
c69f59a is described below

commit c69f59ad8233542b6e218b6ac0c3cd0f9f614487
Author: Lewis John McGibbney <le...@gmail.com>
AuthorDate: Wed Oct 20 11:04:11 2021 -0700

    ANY23-504 XML-based parsers should not load external DTDs by default (#205)
    
    * ANY23-504 XML-based parsers should not load external DTDs by default
---
 .../any23/extractor/SingleDocumentExtraction.java  |  3 +-
 .../any23/extractor/rdf/RDFParserFactory.java      |  2 ++
 core/src/test/java/org/apache/any23/Any23Test.java | 26 +++++++++++++++++
 .../extractor/SingleDocumentExtractionTest.java    | 33 ++++++++++++++++++++++
 .../any23/extractor/rdf/JSONLDExtractorTest.java   |  1 -
 5 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index a599cdf..a09e8fe 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -71,7 +71,8 @@ import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
 import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
 
 /**
- * This class acts as facade where all the extractors were called on a single document.
+ * This class acts as a facade where all extractors (for a given MIMEType) can be called on a single document.
+ * Extractors are automatically filtered by MIMEType.
  */
 public class SingleDocumentExtraction {
 
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
index 6a64048..579b224 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
@@ -31,6 +31,7 @@ import org.eclipse.rdf4j.rio.Rio;
 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
 import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
 import org.eclipse.rdf4j.rio.helpers.RDFaVersion;
+import org.eclipse.rdf4j.rio.helpers.XMLParserSettings;
 import org.eclipse.rdf4j.rio.turtle.TurtleParser;
 import org.semanticweb.owlapi.rio.OWLAPIRDFFormat;
 import org.slf4j.Logger;
@@ -295,6 +296,7 @@ public class RDFParserFactory {
             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
         parser.getParserConfig().setNonFatalErrors(
                 stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
+        parser.getParserConfig().set(XMLParserSettings.LOAD_EXTERNAL_DTD, false);
         parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
         parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
 
diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java
index da5534a..1bdd12e 100644
--- a/core/src/test/java/org/apache/any23/Any23Test.java
+++ b/core/src/test/java/org/apache/any23/Any23Test.java
@@ -519,6 +519,32 @@ public class Any23Test extends Any23OnlineTestBase {
     }
 
     /**
+     * Test whether the {@link Any23} facade can be overloaded with an intentional <code>rdf-xml</code> extractor
+     * (mis)configuration which we then attempt to use to process an <code>application/xhtml+xml</code> document. The
+     * expected behaviour is that the call to
+     * {@link org.apache.any23.extractor.SingleDocumentExtraction#run(ExtractionParameters)} will ultimately filter the
+     * extractors based upon the detected mimetype. This results in no available extractors and a largely empty
+     * {@link ExtractionReport}.
+     * 
+     * @throws Exception
+     *             if an extraction anomaly arises
+     */
+    @Test
+    public void testMisconfiguredAny23FacadeForInputData() throws Exception {
+        Any23 runner = new Any23("rdf-xml");
+        CountingTripleHandler handler = new CountingTripleHandler();
+        ExtractionReport report = runner.extract(
+                IOUtils.resourceToString("/html/BBC_News_Scotland.html", StandardCharsets.UTF_8),
+                "http://www.bbc.co.uk/news/scotland/", handler);
+        Assert.assertEquals("application/xhtml+xml", report.getDetectedMimeType());
+        Assert.assertEquals(0, report.getExtractorIssues("rdf-xml").size());
+        Assert.assertEquals(0, report.getMatchingExtractors().size());
+        Assert.assertEquals(0, handler.getCount());
+        Assert.assertEquals(report.getValidationReport().getClass().getName(),
+                "org.apache.any23.validator.EmptyValidationReport");
+    }
+
+    /**
      * Performs detection and extraction on the given input string and return the {@link ExtractionReport}.
      * 
      * @param in
diff --git a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
index f4ca0ae..6495571 100644
--- a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
@@ -21,6 +21,7 @@ import org.apache.any23.AbstractAny23TestBase;
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.configuration.ModifiableConfiguration;
 import org.apache.any23.extractor.html.HTMLFixture;
+import org.apache.any23.extractor.rdf.TriXExtractor;
 import org.apache.any23.mime.TikaMIMETypeDetector;
 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
 import org.apache.any23.vocab.ICAL;
@@ -50,6 +51,9 @@ import org.eclipse.rdf4j.sail.memory.MemoryStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
 import java.io.ByteArrayOutputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
@@ -245,6 +249,35 @@ public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
     }
 
+    /**
+     * Tests that the {@link org.apache.any23.extractor.rdf.TriXExtractor} is NOT activated for a given HTML document.
+     * This tests that a private method within {@link org.apache.any23.extractor.SingleDocumentExtraction} works as
+     * expected.
+     * 
+     * @see <a href=
+     *      "https://issues.apache.org/jira/browse/ANY23-504">https://issues.apache.org/jira/browse/ANY23-504</a>
+     *
+     * @throws IOException
+     *             if there is an error loading input data
+     * @throws ExtractionException
+     *             if an exception is raised during extraction
+     * @throws RepositoryException
+     *             if an error is encountered whilst loading content from a storage connection
+     */
+    @Test
+    public void testTrixParserNotActivatedAfterFilterExtractorsByMIMEType()
+            throws IOException, ExtractionException, RepositoryException {
+        singleDocumentExtraction = getInstance("/html/BBC_News_Scotland.html");
+        assertTrue(singleDocumentExtraction.hasMatchingExtractors());
+        assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
+                .anyMatch(e -> TriXExtractor.class.isInstance(e)));
+        singleDocumentExtraction.run();
+        assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
+                .anyMatch(e -> TriXExtractor.class.isInstance(e)));
+
+        logStorageContent();
+    }
+
     private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
         baos = new ByteArrayOutputStream();
         rdfxmlWriter = new RDFXMLWriter(baos);
diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
index dc8e9eb..033aa1f 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
@@ -72,7 +72,6 @@ public class JSONLDExtractorTest {
         for (int i = 0; i <= Character.MAX_CODE_POINT; i++) {
             if (Character.isWhitespace(i) || Character.isSpaceChar(i)) {
                 byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8);
-                @SuppressWarnings("resource")
                 InputStream stream = new JsonCleaningInputStream(new ByteArrayInputStream(bytes));
                 if (i == '\r' || i == '\n') {
                     Assert.assertEquals(stream.read(), i);