You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/16 17:59:42 UTC

[tika] branch TIKA-3109 created (now 9c6eb0c9d)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3109
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 9c6eb0c9d TIKA-3109 -- parse iframe's srcdoc as an embedded document

This branch includes the following new commits:

     new 9c6eb0c9d TIKA-3109 -- parse iframe's srcdoc as an embedded document

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3109 -- parse iframe's srcdoc as an embedded document

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3109
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9c6eb0c9d40a0ebd5ea1e185ecd86ab3091c5a74
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 16 13:59:27 2023 -0400

    TIKA-3109 -- parse iframe's srcdoc as an embedded document
---
 CHANGES.txt                                        |  2 +
 .../org/apache/tika/parser/html/HtmlHandler.java   | 21 +++++++++++
 .../org/apache/tika/parser/html/SrcDocTest.java    | 44 ++++++++++++++++++++++
 .../test/resources/test-documents/testSrcDoc.html  | 22 +++++++++++
 4 files changed, 89 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 55bd83671..fe13ee36c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.8.1 - ???
 
    * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116).
 
+   * Parse iframe's srcdoc as an embedded file (TIKA-3109).
+
    * Changed default decompressConcatenated to true in CompressorParser.
      Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index ae2de95d5..79c48aaea 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -187,6 +187,11 @@ class HtmlHandler extends TextContentHandler {
                 handleDataURIScheme(value);
             }
         }
+        String srcDoc = atts.getValue("srcdoc");
+        if (! StringUtils.isBlank(srcDoc)) {
+            //check for iframe?
+            handleSrcDoc(srcDoc);
+        }
     }
 
     /**
@@ -339,6 +344,22 @@ class HtmlHandler extends TextContentHandler {
             discardLevel--;
         }
     }
+    private void handleSrcDoc(String string) throws SAXException { // passes a srcdoc attribute value to the embedded document extractor
+        Metadata m = new Metadata(); // new metadata object describing only the srcdoc content
+        m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+        m.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html"); // presumably makes the extractor pick the HTML parser -- TODO confirm
+        //TODO add metadata about iframe content?
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { // nothing is parsed when the extractor declines
+            try (InputStream stream = new UnsynchronizedByteArrayInputStream(string.getBytes(StandardCharsets.UTF_8))) {
+                embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true); // stream holds the attribute text encoded as UTF-8
+            } catch (IOException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); // IOException is handed to the recorder instead of being rethrown here
+            }
+        }
+    }
 
     private void handleDataURIScheme(String string) throws SAXException {
         DataURIScheme dataURIScheme = null;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java
new file mode 100644
index 000000000..64f1189f3
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class SrcDocTest extends TikaTest { // exercises extraction of an iframe's srcdoc attribute from testSrcDoc.html
+
+
+    @Test
+    public void testBasic() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testSrcDoc.html");
+        debug(metadataList); // NOTE(review): looks like leftover debugging output -- confirm whether it should stay
+        assertEquals(2, metadataList.size()); // expects the outer page plus one entry for the srcdoc content
+        assertContains("outside", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); // index 0: text of the outer page
+        assertContains("this is the iframe content",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); // index 1: text taken from the srcdoc attribute
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString(),
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); // srcdoc entry must be marked INLINE
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testSrcDoc.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testSrcDoc.html
new file mode 100644
index 000000000..8d33de7d8
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testSrcDoc.html
@@ -0,0 +1,22 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+outside
+<iframe srcdoc="<p>this is the iframe content</p>"></iframe>
+</body>
+</html>
\ No newline at end of file