You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/02 14:22:09 UTC
(tika) branch main updated: Fix html parsing logic when body contains script tag (#1522)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8ac572352 Fix html parsing logic when body contains script tag (#1522)
8ac572352 is described below
commit 8ac572352a5f9dff5fb654b696231c9b7656b6ac
Author: Yakiv Yereskovskyi <ya...@gmail.com>
AuthorDate: Tue Jan 2 14:22:03 2024 +0000
Fix html parsing logic when body contains script tag (#1522)
* Fix html parsing logic when body contains script tag
---
.../src/main/java/org/apache/tika/parser/html/JSoupParser.java | 2 +-
.../src/test/java/org/apache/tika/parser/html/HtmlParserTest.java | 7 +++++++
.../src/test/resources/test-documents/testHTML_script_in_body.html | 7 +++++++
3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
index 41e6f6eed..b9bc4db8b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
@@ -199,7 +199,7 @@ public class JSoupParser extends AbstractEncodingDetectorParser {
@Override
public NodeFilter.FilterResult tail(Node node, int i) {
- if (node instanceof TextNode) {
+ if (node instanceof TextNode || node instanceof DataNode) {
return FilterResult.CONTINUE;
}
try {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index bd12ab401..279040bcf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1091,6 +1091,13 @@ public class HtmlParserTest extends TikaTest {
assertNotContained("cool", xml);
}
+ @Test
+ public void testScriptInBody() throws Exception {
+ String xml = getXML("testHTML_script_in_body.html").xml;
+ assertContains("This is a test", xml);
+ assertNotContained("cool", xml);
+ }
+
@Test
public void testExtractScript() throws Exception {
JSoupParser p = new JSoupParser();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_script_in_body.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_script_in_body.html
new file mode 100644
index 000000000..52569cd9a
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_script_in_body.html
@@ -0,0 +1,7 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<body>
+<script lang="javascript">cool script</script>
+<p>This is a test.</p>
+</body>
+</html>
\ No newline at end of file