You are viewing a plain-text version of this content; the hyperlink to the canonical version was not preserved in this copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/02 14:22:09 UTC

(tika) branch main updated: Fix html parsing logic when body contains script tag (#1522)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 8ac572352 Fix html parsing logic when body contains script tag (#1522)
8ac572352 is described below

commit 8ac572352a5f9dff5fb654b696231c9b7656b6ac
Author: Yakiv Yereskovskyi <ya...@gmail.com>
AuthorDate: Tue Jan 2 14:22:03 2024 +0000

    Fix html parsing logic when body contains script tag (#1522)
    
    * Fix html parsing logic when body contains script tag
---
 .../src/main/java/org/apache/tika/parser/html/JSoupParser.java     | 2 +-
 .../src/test/java/org/apache/tika/parser/html/HtmlParserTest.java  | 7 +++++++
 .../src/test/resources/test-documents/testHTML_script_in_body.html | 7 +++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 41e6f6eed..b9bc4db8b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -199,7 +199,7 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
 
         @Override
         public NodeFilter.FilterResult tail(Node node, int i) {
-            if (node instanceof TextNode) {
+            if (node instanceof TextNode || node instanceof DataNode) {
                 return FilterResult.CONTINUE;
             }
             try {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index bd12ab401..279040bcf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1091,6 +1091,13 @@ public class HtmlParserTest extends TikaTest {
         assertNotContained("cool", xml);
     }
 
+    @Test
+    public void testScriptInBody() throws Exception {
+        String xml = getXML("testHTML_script_in_body.html").xml;
+        assertContains("This is a test", xml);
+        assertNotContained("cool", xml);
+    }
+
     @Test
     public void testExtractScript() throws Exception {
         JSoupParser p = new JSoupParser();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_script_in_body.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_script_in_body.html
new file mode 100644
index 000000000..52569cd9a
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_script_in_body.html
@@ -0,0 +1,7 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<body>
+<script lang="javascript">cool script</script>
+<p>This is a test.</p>
+</body>
+</html>
\ No newline at end of file