You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by le...@apache.org on 2021/10/20 18:04:19 UTC
[any23] branch master updated: ANY23-504 XML-based parsers should
not load external DTDs by default (#205)
This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/any23.git
The following commit(s) were added to refs/heads/master by this push:
new c69f59a ANY23-504 XML-based parsers should not load external DTDs by default (#205)
c69f59a is described below
commit c69f59ad8233542b6e218b6ac0c3cd0f9f614487
Author: Lewis John McGibbney <le...@gmail.com>
AuthorDate: Wed Oct 20 11:04:11 2021 -0700
ANY23-504 XML-based parsers should not load external DTDs by default (#205)
* ANY23-504 XML-based parsers should not load external DTDs by default
---
.../any23/extractor/SingleDocumentExtraction.java | 3 +-
.../any23/extractor/rdf/RDFParserFactory.java | 2 ++
core/src/test/java/org/apache/any23/Any23Test.java | 26 +++++++++++++++++
.../extractor/SingleDocumentExtractionTest.java | 33 ++++++++++++++++++++++
.../any23/extractor/rdf/JSONLDExtractorTest.java | 1 -
5 files changed, 63 insertions(+), 2 deletions(-)
diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
index a599cdf..a09e8fe 100644
--- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
+++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -71,7 +71,8 @@ import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
/**
- * This class acts as facade where all the extractors were called on a single document.
+ * This class acts as a facade where all extractors (for a given MIMEType) can be called on a single document.
+ * Extractors are automatically filtered by MIMEType.
*/
public class SingleDocumentExtraction {
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
index 6a64048..579b224 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
@@ -31,6 +31,7 @@ import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
import org.eclipse.rdf4j.rio.helpers.RDFaVersion;
+import org.eclipse.rdf4j.rio.helpers.XMLParserSettings;
import org.eclipse.rdf4j.rio.turtle.TurtleParser;
import org.semanticweb.owlapi.rio.OWLAPIRDFFormat;
import org.slf4j.Logger;
@@ -295,6 +296,7 @@ public class RDFParserFactory {
final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
parser.getParserConfig().setNonFatalErrors(
stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
+ parser.getParserConfig().set(XMLParserSettings.LOAD_EXTERNAL_DTD, false);
parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java
index da5534a..1bdd12e 100644
--- a/core/src/test/java/org/apache/any23/Any23Test.java
+++ b/core/src/test/java/org/apache/any23/Any23Test.java
@@ -519,6 +519,32 @@ public class Any23Test extends Any23OnlineTestBase {
}
/**
+ * Test whether the {@link Any23} facade can be overloaded with an intentional <code>rdf-xml</code> extractor
+ * (mis)configuration which we then attempt to use to process a <code>application/xhtml+xml</code> document. The
+ * expected behaviour is that the call to
+ * {@link org.apache.any23.extractor.SingleDocumentExtraction#run(ExtractionParameters)} will ultimately filter the
+ * extractors based upon the detected mimetype. This results in no available extractors and a largely empty
+ * {@link ExtractionReport}.
+ *
+ * @throws Exception
+ * if an extraction anomaly arises
+ */
+ @Test
+ public void testMisconfiguredAny23FacadeForInputData() throws Exception {
+ Any23 runner = new Any23("rdf-xml");
+ CountingTripleHandler handler = new CountingTripleHandler();
+ ExtractionReport report = runner.extract(
+ IOUtils.resourceToString("/html/BBC_News_Scotland.html", StandardCharsets.UTF_8),
+ "http://www.bbc.co.uk/news/scotland/", handler);
+ Assert.assertEquals("application/xhtml+xml", report.getDetectedMimeType());
+ Assert.assertEquals(0, report.getExtractorIssues("rdf-xml").size());
+ Assert.assertEquals(0, report.getMatchingExtractors().size());
+ Assert.assertEquals(0, handler.getCount());
+ Assert.assertEquals(report.getValidationReport().getClass().getName(),
+ "org.apache.any23.validator.EmptyValidationReport");
+ }
+
+ /**
* Performs detection and extraction on the given input string and return the {@link ExtractionReport}.
*
* @param in
diff --git a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
index f4ca0ae..6495571 100644
--- a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
@@ -21,6 +21,7 @@ import org.apache.any23.AbstractAny23TestBase;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.configuration.ModifiableConfiguration;
import org.apache.any23.extractor.html.HTMLFixture;
+import org.apache.any23.extractor.rdf.TriXExtractor;
import org.apache.any23.mime.TikaMIMETypeDetector;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.any23.vocab.ICAL;
@@ -50,6 +51,9 @@ import org.eclipse.rdf4j.sail.memory.MemoryStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
@@ -245,6 +249,35 @@ public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
}
+ /**
+ * Tests that the {@link org.apache.any23.extractor.rdf.TriXExtractor} is NOT activated for a given HTML document.
+ * This tests that a private method within {@link org.apache.any23.extractor.SingleDocumentExtraction} works as
+ * expected.
+ *
+ * @see <a href=
+ * "https://issues.apache.org/jira/browse/ANY23-504">https://issues.apache.org/jira/browse/ANY23-504</a>
+ *
+ * @throws IOException
+ * if there is an error loading input data
+ * @throws ExtractionException
+ * if an exception is raised during extraction
+ * @throws RepositoryException
+ * if an error is encountered whilst loading content from a storage connection
+ */
+ @Test
+ public void testTrixParserNotActivatedAfterFilterExtractorsByMIMEType()
+ throws IOException, ExtractionException, RepositoryException {
+ singleDocumentExtraction = getInstance("/html/BBC_News_Scotland.html");
+ assertTrue(singleDocumentExtraction.hasMatchingExtractors());
+ assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
+ .anyMatch(e -> TriXExtractor.class.isInstance(e)));
+ singleDocumentExtraction.run();
+ assertFalse(singleDocumentExtraction.getMatchingExtractors().stream()
+ .anyMatch(e -> TriXExtractor.class.isInstance(e)));
+
+ logStorageContent();
+ }
+
private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
baos = new ByteArrayOutputStream();
rdfxmlWriter = new RDFXMLWriter(baos);
diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
index dc8e9eb..033aa1f 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
@@ -72,7 +72,6 @@ public class JSONLDExtractorTest {
for (int i = 0; i <= Character.MAX_CODE_POINT; i++) {
if (Character.isWhitespace(i) || Character.isSpaceChar(i)) {
byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8);
- @SuppressWarnings("resource")
InputStream stream = new JsonCleaningInputStream(new ByteArrayInputStream(bytes));
if (i == '\r' || i == '\n') {
Assert.assertEquals(stream.read(), i);