You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/07/28 15:42:46 UTC

[tika] branch main updated: TIKA-1484 - isolate boilerpipe dependencies to tika-app, tika-bundle-standard and tika-server-standard

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 773bf3bf6 TIKA-1484 - isolate boilerpipe dependencies to tika-app, tika-bundle-standard and tika-server-standard
773bf3bf6 is described below

commit 773bf3bf69751602d0e36cf28f342a27df50fd8f
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 28 11:42:31 2022 -0400

    TIKA-1484 - isolate boilerpipe dependencies to tika-app, tika-bundle-standard and tika-server-standard
---
 CHANGES.txt                                        |   7 +
 tika-app/pom.xml                                   |   5 +
 tika-bundles/tika-bundle-standard/pom.xml          |   5 +
 .../tika-parser-html-commons/README.md             |   4 +
 .../tika-parser-html-module/pom.xml                |   5 -
 .../apache/tika/parser/html/HtmlParserTest.java    | 105 --------------
 .../tika-parsers-standard-package/pom.xml          |   6 +
 .../org/apache/tika/sax/BoilerpipeHandlerTest.java | 160 +++++++++++++++++++++
 tika-server/tika-server-standard/pom.xml           |   5 +
 9 files changed, 192 insertions(+), 110 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index ca1fbfa28..779fe25f6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,12 @@
 Release 2.4.2 - ???
 
+   * tika-parser-html-commons (BoilerpipeContentHandler) is no longer
+     a dependency of tika-parser-html-module. tika-app, tika-bundle-standard
+     and tika-server-standard have added a dependency on
+     tika-parser-html-commons.  However, users who are managing custom
+     dependencies and who want the BoilerpipeContentHandler will now have
+     to include the tika-parser-html-commons dependency (TIKA-1484).
+
    * Add unrar as an optional parser (TIKA-3800).
 
    * Refactor FuzzingCLI to use PipesParser (TIKA-3799).
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index e3571285d..ce94ab2b8 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -43,6 +43,11 @@
       <artifactId>tika-parsers-standard-package</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parser-html-commons</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-langdetect-optimaize</artifactId>
diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index ca3bee097..c8775c584 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -52,6 +52,11 @@
       <artifactId>tika-parsers-standard-package</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-html-commons</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.logging.log4j</groupId>
       <artifactId>log4j-api</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
new file mode 100644
index 000000000..9fadc6144
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
@@ -0,0 +1,4 @@
+This module only contains the BoilerpipeContentHandler.  The boilerpipe dependency is no
+longer maintained and contains clashes with NekoHTML.
+
+In Tika 3.x, we should rename this module to tika-handler-boilerpipe or similar.
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
index 2410d2940..0f9af79ce 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
@@ -30,11 +30,6 @@
   <name>Apache Tika html parser module</name>
 
   <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
-      <version>${project.version}</version>
-    </dependency>
     <dependency>
       <groupId>org.ccil.cowan.tagsoup</groupId>
       <artifactId>tagsoup</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 6da84498b..093e2af34 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -80,7 +80,6 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
 
 public class HtmlParserTest extends TikaTest {
 
@@ -412,28 +411,6 @@ public class HtmlParserTest extends TikaTest {
         assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
-    /**
-     * Test case for TIKA-420
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
-     */
-    @Test
-    public void testBoilerplateRemoval() throws Exception {
-        String path = "/test-documents/boilerplate.html";
-
-        Metadata metadata = new Metadata();
-        BodyContentHandler handler = new BodyContentHandler();
-        new HtmlParser()
-                .parse(getResourceAsStream(path), new BoilerpipeContentHandler(handler), metadata,
-                        new ParseContext());
-
-        String content = handler.toString();
-        assertTrue(content.startsWith("This is the real meat"));
-        assertTrue(content.endsWith("This is the end of the text.\n"));
-        assertFalse(content.contains("boilerplate"));
-        assertFalse(content.contains("footer"));
-    }
-
     /**
      * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
      *
@@ -740,33 +717,6 @@ public class HtmlParserTest extends TikaTest {
         return handler;
     }
 
-    /**
-     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
-     */
-    @Test
-    public void testBoilerplateWithMarkup() throws Exception {
-        String path = "/test-documents/boilerplate.html";
-
-        Metadata metadata = new Metadata();
-        StringWriter sw = new StringWriter();
-        ContentHandler ch = makeHtmlTransformer(sw);
-        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
-        bpch.setIncludeMarkup(true);
-
-        new HtmlParser().parse(getResourceAsStream(path), bpch, metadata, new ParseContext());
-
-        String content = sw.toString();
-        assertTrue(content.contains("<body><table><tr><td><table><tr><td>"),
-                "Has empty table elements");
-        assertTrue(content.contains("<a shape=\"rect\" href=\"Main.php\"/>"), "Has empty a element");
-        assertTrue(content.contains("<p>This is the real meat"), "Has real content");
-        assertTrue(content.endsWith("</p></body></html>"), "Ends with appropriate HTML");
-        assertFalse(content.contains("boilerplate"));
-        assertFalse(content.contains("footer"));
-    }
-
     /**
      * Test case for TIKA-434 - Pushback buffer overflow in TagSoup
      */
@@ -843,61 +793,6 @@ public class HtmlParserTest extends TikaTest {
                 "Missing HTML lang attribute");
     }
 
-    /**
-     * Test case for TIKA-961
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
-     */
-    @Test
-    public void testBoilerplateWhitespace() throws Exception {
-        String path = "/test-documents/boilerplate-whitespace.html";
-
-        Metadata metadata = new Metadata();
-        BodyContentHandler handler = new BodyContentHandler();
-
-        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
-        bpHandler.setIncludeMarkup(true);
-
-        new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
-
-        String content = handler.toString();
-
-        // Should not contain item_aitem_b
-        assertFalse(content.contains("item_aitem_b"));
-
-        // Should contain the two list items with a newline in between.
-        assertContains("item_a\nitem_b", content);
-
-        // Should contain 有什么需要我帮你的 (can i help you) without whitespace
-        assertContains("有什么需要我帮你的", content);
-    }
-
-    /**
-     * Test case for TIKA-2683
-     *
-     * @see <a href="https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683">TIKA-2683</a>
-     */
-    @Test
-    public void testBoilerplateMissingWhitespace() throws Exception {
-        String path = "/test-documents/testBoilerplateMissingSpace.html";
-
-        Metadata metadata = new Metadata();
-        BodyContentHandler handler = new BodyContentHandler();
-
-        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
-        bpHandler.setIncludeMarkup(true);
-
-        new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
-
-        String content = handler.toString();
-
-        // Should contain space between these two words as mentioned in HTML
-        assertContains("family Psychrolutidae", content);
-
-        // Shouldn't add new-line chars around brackets; This is not how the HTML look
-        assertContains("(Psychrolutes marcidus)", content);
-    }
-
     /**
      * Test case for TIKA-983:  HTML parser should add Open Graph
      * meta tag data to Metadata returned by parser
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 07cdb5191..1c3d31918 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -185,6 +185,12 @@
       <version>${imageio.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-html-commons</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-core</artifactId>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
new file mode 100644
index 000000000..ddffc0919
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.StringWriter;
+import java.io.Writer;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
+
+public class BoilerpipeHandlerTest extends TikaTest {
+    /**
+     * Test case for TIKA-420
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+     */
+    @Test
+    public void testBoilerplateRemoval() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+        new HtmlParser()
+                .parse(getResourceAsStream(path), new BoilerpipeContentHandler(handler), metadata,
+                        new ParseContext());
+
+        String content = handler.toString();
+        assertTrue(content.startsWith("This is the real meat"));
+        assertTrue(content.endsWith("This is the end of the text.\n"));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
+    /**
+     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
+     */
+    @Test
+    public void testBoilerplateWithMarkup() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        ContentHandler ch = makeHtmlTransformer(sw);
+        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
+        bpch.setIncludeMarkup(true);
+
+        new HtmlParser().parse(getResourceAsStream(path), bpch, metadata, new ParseContext());
+
+        String content = sw.toString();
+        assertTrue(content.contains("<body><table><tr><td><table><tr><td>"),
+                "Has empty table elements");
+        assertTrue(content.contains("<a shape=\"rect\" href=\"Main.php\"/>"), "Has empty a element");
+        assertTrue(content.contains("<p>This is the real meat"), "Has real content");
+        assertTrue(content.endsWith("</p></body></html>"), "Ends with appropriate HTML");
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
+    /**
+     * Test case for TIKA-961
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
+     */
+    @Test
+    public void testBoilerplateWhitespace() throws Exception {
+        String path = "/test-documents/boilerplate-whitespace.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+
+        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+        bpHandler.setIncludeMarkup(true);
+
+        new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
+
+        String content = handler.toString();
+
+        // Should not contain item_aitem_b
+        assertFalse(content.contains("item_aitem_b"));
+
+        // Should contain the two list items with a newline in between.
+        assertContains("item_a\nitem_b", content);
+
+        // Should contain 有什么需要我帮你的 (can i help you) without whitespace
+        assertContains("有什么需要我帮你的", content);
+    }
+
+    /**
+     * Test case for TIKA-2683
+     *
+     * @see <a href="https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683">TIKA-2683</a>
+     */
+    @Test
+    public void testBoilerplateMissingWhitespace() throws Exception {
+        String path = "/test-documents/testBoilerplateMissingSpace.html";
+
+        Metadata metadata = new Metadata();
+        BodyContentHandler handler = new BodyContentHandler();
+
+        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
+        bpHandler.setIncludeMarkup(true);
+
+        new HtmlParser().parse(getResourceAsStream(path), bpHandler, metadata, new ParseContext());
+
+        String content = handler.toString();
+
+        // Should contain space between these two words as mentioned in HTML
+        assertContains("family Psychrolutidae", content);
+
+        // Shouldn't add new-line chars around brackets; This is not how the HTML look
+        assertContains("(Psychrolutes marcidus)", content);
+    }
+
+    /**
+     * Create ContentHandler that transforms SAX events into textual HTML output,
+     * and writes it out to <writer> - typically this is a StringWriter.
+     *
+     * @param writer Where to write resulting HTML text.
+     * @return ContentHandler suitable for passing to parse() methods.
+     * @throws Exception
+     */
+    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
+        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
+        handler.setResult(new StreamResult(writer));
+        return handler;
+    }
+}
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index 3c0932352..b552dfca4 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -49,6 +49,11 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parser-html-commons</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-xmp</artifactId>