You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/16 18:01:47 UTC

[tika] branch TIKA-3109 updated: TIKA-3109 -- parse iframe's srcdoc as an embedded document -- add check for iframe and remove debug

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3109
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3109 by this push:
     new 83793014f TIKA-3109 -- parse iframe's srcdoc as an embedded document -- add check for iframe and remove debug
83793014f is described below

commit 83793014f588214152e5e92c7653b36a63739432
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 16 14:01:39 2023 -0400

    TIKA-3109 -- parse iframe's srcdoc as an embedded document -- add check for iframe and remove debug
---
 .../src/main/java/org/apache/tika/parser/html/HtmlHandler.java   | 9 +++++----
 .../src/test/java/org/apache/tika/parser/html/SrcDocTest.java    | 1 -
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 79c48aaea..3050422c4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -187,10 +187,11 @@ class HtmlHandler extends TextContentHandler {
                 handleDataURIScheme(value);
             }
         }
-        String srcDoc = atts.getValue("srcdoc");
-        if (! StringUtils.isBlank(srcDoc)) {
-            //check for iframe?
-            handleSrcDoc(srcDoc);
+        if ("IFRAME".equals(name)) {
+            String srcDoc = atts.getValue("srcdoc");
+            if (!StringUtils.isBlank(srcDoc)) {
+                handleSrcDoc(srcDoc);
+            }
         }
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java
index 64f1189f3..e7f6d1c70 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/SrcDocTest.java
@@ -33,7 +33,6 @@ public class SrcDocTest extends TikaTest {
     @Test
     public void testBasic() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testSrcDoc.html");
-        debug(metadataList);
         assertEquals(2, metadataList.size());
         assertContains("outside", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
         assertContains("this is the iframe content",