You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/07/28 15:42:46 UTC
[tika] branch main updated: TIKA-1484 - isolate boilerpipe dependencies to tika-app, tika-bundle-standard and tika-server-standard
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 773bf3bf6 TIKA-1484 - isolate boilerpipe dependencies to tika-app, tika-bundle-standard and tika-server-standard
773bf3bf6 is described below
commit 773bf3bf69751602d0e36cf28f342a27df50fd8f
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 28 11:42:31 2022 -0400
TIKA-1484 - isolate boilerpipe dependencies to tika-app, tika-bundle-standard and tika-server-standard
---
CHANGES.txt | 7 +
tika-app/pom.xml | 5 +
tika-bundles/tika-bundle-standard/pom.xml | 5 +
.../tika-parser-html-commons/README.md | 4 +
.../tika-parser-html-module/pom.xml | 5 -
.../apache/tika/parser/html/HtmlParserTest.java | 105 --------------
.../tika-parsers-standard-package/pom.xml | 6 +
.../org/apache/tika/sax/BoilerpipeHandlerTest.java | 160 +++++++++++++++++++++
tika-server/tika-server-standard/pom.xml | 5 +
9 files changed, 192 insertions(+), 110 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index ca1fbfa28..779fe25f6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,12 @@
Release 2.4.2 - ???
+ * tika-parser-html-commons (BoilerpipeContentHandler) is no longer
+ a dependency of tika-parser-html-module. tika-app, tika-bundle-standard and
+ tika-server-standard have added a dependency on tika-parser-html-commons. However,
+ users who are managing custom dependencies and who want the BoilerpipeContentHandler
+ will now have to include the tika-parser-html-commons dependency
+ (TIKA-1484).
+
* Add unrar as an optional parser (TIKA-3800).
* Refactor FuzzingCLI to use PipesParser (TIKA-3799).
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index e3571285d..ce94ab2b8 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -43,6 +43,11 @@
<artifactId>tika-parsers-standard-package</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-html-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-langdetect-optimaize</artifactId>
diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index ca3bee097..c8775c584 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -52,6 +52,11 @@
<artifactId>tika-parsers-standard-package</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-html-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
new file mode 100644
index 000000000..9fadc6144
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
@@ -0,0 +1,4 @@
+This module only contains the BoilerpipeContentHandler. The boilerpipe dependency is no
+longer maintained and clashes with NekoHTML.
+
+In Tika 3.x, we should rename this module to tika-handler-boilerpipe or similar.
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
index 2410d2940..0f9af79ce 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
@@ -30,11 +30,6 @@
<name>Apache Tika html parser module</name>
<dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
- <version>${project.version}</version>
- </dependency>
<dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 6da84498b..093e2af34 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -80,7 +80,6 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
public class HtmlParserTest extends TikaTest {
@@ -412,28 +411,6 @@ public class HtmlParserTest extends TikaTest {
assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
}
- /**
- * Test case for TIKA-420
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
- */
- @Test
- public void testBoilerplateRemoval() throws Exception {
- String path = "/test-documents/boilerplate.html";
-
- Metadata metadata = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
- new HtmlParser()
- .parse(getResourceAsStream(path), new BoilerpipeContentHandler(handler), metadata,
- new ParseContext());
-
- String content = handler.toString();
- assertTrue(content.startsWith("This is the real meat"));
- assertTrue(content.endsWith("This is the end of the text.\n"));
- assertFalse(content.contains("boilerplate"));
- assertFalse(content.contains("footer"));
- }
-
/**
* Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
*
@@ -740,33 +717,6 @@ public class HtmlParserTest extends TikaTest {
return handler;
}
- /**
- * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
- */
- @Test
- public void testBoilerplateWithMarkup() throws Exception {
- String path = "/test-documents/boilerplate.html";
-
- Metadata metadata = new Metadata();
- StringWriter sw = new StringWriter();
- ContentHandler ch = makeHtmlTransformer(sw);
- BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
- bpch.setIncludeMarkup(true);
-
- new HtmlParser().parse(getResourceAsStream(path), bpch, metadata, new ParseContext());
-
- String content = sw.toString();
- assertTrue(content.contains("<body><table><tr><td><table><tr><td>"),
- "Has empty table elements");
- assertTrue(content.contains("<a shape=\"rect\" href=\"Main.php\"/>"), "Has empty a element");
- assertTrue(content.contains("<p>This is the real meat"), "Has real content");
- assertTrue(content.endsWith("</p></body></html>"), "Ends with appropriate HTML");
- assertFalse(content.contains("boilerplate"));
- assertFalse(content.contains("footer"));
- }
-
/**
* Test case for TIKA-434 - Pushback buffer overflow in TagSoup
*/
@@ -843,61 +793,6 @@ public class HtmlParserTest extends TikaTest {
"Missing HTML lang attribute");
}
- /**
- * Test case for TIKA-961
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
- */
- @Test
- public void testBoilerplateWhitespace() throws Exception {
- String path = "/test-documents/boilerplate-whitespace.html";
-
- Metadata metadata = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
-
- BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
- bpHandler.setIncludeMarkup(true);
-
- new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
-
- String content = handler.toString();
-
- // Should not contain item_aitem_b
- assertFalse(content.contains("item_aitem_b"));
-
- // Should contain the two list items with a newline in between.
- assertContains("item_a\nitem_b", content);
-
- // Should contain 有什么需要我帮你的 (can i help you) without whitespace
- assertContains("有什么需要我帮你的", content);
- }
-
- /**
- * Test case for TIKA-2683
- *
- * @see <a href="https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683">TIKA-2683</a>
- */
- @Test
- public void testBoilerplateMissingWhitespace() throws Exception {
- String path = "/test-documents/testBoilerplateMissingSpace.html";
-
- Metadata metadata = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
-
- BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
- bpHandler.setIncludeMarkup(true);
-
- new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
-
- String content = handler.toString();
-
- // Should contain space between these two words as mentioned in HTML
- assertContains("family Psychrolutidae", content);
-
- // Shouldn't add new-line chars around brackets; This is not how the HTML look
- assertContains("(Psychrolutes marcidus)", content);
- }
-
/**
* Test case for TIKA-983: HTML parser should add Open Graph
* meta tag data to Metadata returned by parser
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 07cdb5191..1c3d31918 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -185,6 +185,12 @@
<version>${imageio.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-html-commons</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-core</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
new file mode 100644
index 000000000..ddffc0919
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.StringWriter;
+import java.io.Writer;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
+
+public class BoilerpipeHandlerTest extends TikaTest {
+ /**
+ * Test case for TIKA-420: boilerplate and footer text are absent from the extracted body text.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+ */
+ @Test
+ public void testBoilerplateRemoval() throws Exception {
+ String path = "/test-documents/boilerplate.html";
+
+ Metadata metadata = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+ new HtmlParser()
+ .parse(getResourceAsStream(path), new BoilerpipeContentHandler(handler), metadata,
+ new ParseContext());
+
+ String content = handler.toString();
+ assertTrue(content.startsWith("This is the real meat"));
+ assertTrue(content.endsWith("This is the end of the text.\n"));
+ assertFalse(content.contains("boilerplate"));
+ assertFalse(content.contains("footer"));
+ }
+
+ /**
+ * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
+ */
+ @Test
+ public void testBoilerplateWithMarkup() throws Exception {
+ String path = "/test-documents/boilerplate.html";
+
+ Metadata metadata = new Metadata();
+ StringWriter sw = new StringWriter();
+ ContentHandler ch = makeHtmlTransformer(sw);
+ BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
+ bpch.setIncludeMarkup(true);
+
+ new HtmlParser().parse(getResourceAsStream(path), bpch, metadata, new ParseContext());
+
+ String content = sw.toString();
+ assertTrue(content.contains("<body><table><tr><td><table><tr><td>"),
+ "Has empty table elements");
+ assertTrue(content.contains("<a shape=\"rect\" href=\"Main.php\"/>"), "Has empty a element");
+ assertTrue(content.contains("<p>This is the real meat"), "Has real content");
+ assertTrue(content.endsWith("</p></body></html>"), "Ends with appropriate HTML");
+ assertFalse(content.contains("boilerplate"));
+ assertFalse(content.contains("footer"));
+ }
+
+ /**
+ * Test case for TIKA-961: list items stay on separate lines and CJK text gains no whitespace.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+ */
+ @Test
+ public void testBoilerplateWhitespace() throws Exception {
+ String path = "/test-documents/boilerplate-whitespace.html";
+
+ Metadata metadata = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+
+ BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+ bpHandler.setIncludeMarkup(true);
+
+ new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
+
+ String content = handler.toString();
+
+ // Should not contain item_aitem_b
+ assertFalse(content.contains("item_aitem_b"));
+
+ // Should contain the two list items with a newline in between.
+ assertContains("item_a\nitem_b", content);
+
+ // Should contain 有什么需要我帮你的 (can I help you) without whitespace
+ assertContains("有什么需要我帮你的", content);
+ }
+
+ /**
+ * Test case for TIKA-2683: spaces between words survive and no newlines appear around brackets.
+ *
+ * @see <a href="https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683">TIKA-2683</a>
+ */
+ @Test
+ public void testBoilerplateMissingWhitespace() throws Exception {
+ String path = "/test-documents/testBoilerplateMissingSpace.html";
+
+ Metadata metadata = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+
+ BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+ bpHandler.setIncludeMarkup(true);
+
+ new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
+
+ String content = handler.toString();
+
+ // Should contain space between these two words as mentioned in HTML
+ assertContains("family Psychrolutidae", content);
+
+ // Shouldn't add new-line chars around brackets; this is not how the HTML looks
+ assertContains("(Psychrolutes marcidus)", content);
+ }
+
+ /**
+ * Creates a ContentHandler that serializes SAX events as textual HTML output
+ * and writes it to {@code writer} - typically this is a StringWriter.
+ *
+ * @param writer Where to write resulting HTML text (html method, no indent, utf-8 requested).
+ * @return ContentHandler suitable for passing to parse() methods.
+ * @throws Exception if the transformer handler cannot be created or configured
+ */
+ private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
+ SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+ handler.setResult(new StreamResult(writer));
+ return handler;
+ }
+}
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index 3c0932352..b552dfca4 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -49,6 +49,11 @@
</exclusion>
</exclusions>
</dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-html-commons</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-xmp</artifactId>