You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/22 14:32:03 UTC

[tika] 01/01: TIKA-4138 -- move BoilerpipeContentHandler

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4138
in repository https://gitbox.apache.org/repos/asf/tika.git

commit cbc46ee9b5295bf14541da8d1f016261c5e30196
Author: tallison <ta...@apache.org>
AuthorDate: Fri Sep 22 10:31:47 2023 -0400

    TIKA-4138 -- move BoilerpipeContentHandler
---
 CHANGES.txt                                        |  5 ++
 pom.xml                                            |  1 +
 tika-app/pom.xml                                   |  2 +-
 tika-bom/pom.xml                                   |  2 +-
 tika-bundles/tika-bundle-standard/pom.xml          |  2 +-
 tika-handlers/README.md                            |  2 +
 tika-handlers/pom.xml                              | 48 ++++++++++++++
 .../tika-handler-boilerpipe/pom.xml                | 26 ++++++--
 .../sax/boilerpipe/BoilerpipeContentHandler.java   |  0
 .../tika-parsers-standard-modules/pom.xml          |  1 -
 .../tika-parser-html-commons/pom.xml               | 74 ----------------------
 .../tika-parsers-standard-package/pom.xml          |  2 +-
 tika-server/tika-server-core/pom.xml               |  2 +-
 tika-server/tika-server-standard/pom.xml           |  6 +-
 14 files changed, 86 insertions(+), 87 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 30c137609..408e42676 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,12 @@
 Release 3.0.0-BETA - ??
 
+   BREAKING CHANGES
+
    * Require Java 11 (TIKA-4128).
 
+   * The boilerpipe handler has been moved to the tika-handler-boilerpipe module (TIKA-4138).
+
+   Other Changes/Updates
    * Fix bug in DateUtils that stripped timezone information from
      incoming Calendar objects (TIKA-4126).
 
diff --git a/pom.xml b/pom.xml
index ab6b22afa..31f025576 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,6 +54,7 @@
     <module>tika-example</module>
     <module>tika-java7</module>
     <module>tika-detectors</module>
+    <module>tika-handlers</module>
   </modules>
 
   <profiles>
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 9a48d2ea9..68ac79477 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -45,7 +45,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index ba2e19d73..5e1aca01e 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -222,7 +222,7 @@
       </dependency>
       <dependency>
         <groupId>org.apache.tika</groupId>
-        <artifactId>tika-parser-html-commons</artifactId>
+        <artifactId>tika-handler-boilerpipe</artifactId>
         <version>3.0.0-SNAPSHOT</version>
       </dependency>
       <dependency>
diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index db605c044..1e18b1cb0 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -58,7 +58,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-handlers/README.md b/tika-handlers/README.md
new file mode 100644
index 000000000..bb45651b3
--- /dev/null
+++ b/tika-handlers/README.md
@@ -0,0 +1,2 @@
+This module is intended to hold non-standard handlers. These may have dependencies that some users don't want,
+or they may have a focus that isn't general enough to warrant adding them to tika-core.
\ No newline at end of file
diff --git a/tika-handlers/pom.xml b/tika-handlers/pom.xml
new file mode 100644
index 000000000..fcab3eb20
--- /dev/null
+++ b/tika-handlers/pom.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parent</artifactId>
+    <version>3.0.0-SNAPSHOT</version>
+    <relativePath>../tika-parent/pom.xml</relativePath>
+  </parent>
+
+  <artifactId>tika-handlers</artifactId> <!-- parent and aggregator POM for the handler modules listed under <modules> -->
+
+  <name>Apache Tika handlers</name>
+  <packaging>pom</packaging> <!-- builds no jar of its own; only aggregates and configures the child modules -->
+
+  <modules>
+    <module>tika-handler-boilerpipe</module>
+  </modules>
+
+  <dependencies> <!-- declared directly (not under dependencyManagement), so every child module inherits it -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope> <!-- on the compile/test classpath only; not passed on transitively, so consumers supply tika-core -->
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md b/tika-handlers/tika-handler-boilerpipe/pom.xml
similarity index 51%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
rename to tika-handlers/tika-handler-boilerpipe/pom.xml
index 82fb00a47..05d0b69b3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
+++ b/tika-handlers/tika-handler-boilerpipe/pom.xml
@@ -1,4 +1,5 @@
-<!---
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
@@ -16,7 +17,24 @@
   specific language governing permissions and limitations
   under the License.
 -->
-This module only contains the BoilerPipeContentHandler.  The boilerpipe dependency is no 
-longer maintained and contains clashes with NekoHTML.
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-handlers</artifactId>
+    <version>3.0.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
 
-In Tika 3.x, we should rename this module to tika-handler-boilerpipe or similar.
\ No newline at end of file
+  <artifactId>tika-handler-boilerpipe</artifactId>
+
+  <dependencies>
+    <dependency>
+      <groupId>de.l3s.boilerpipe</groupId>
+      <artifactId>boilerpipe</artifactId>
+      <version>${boilerpipe.version}</version> <!-- use the shared version property rather than a hard-coded 1.1.0; NOTE(review): confirm boilerpipe.version resolves via the tika-parent chain -->
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java b/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
similarity index 100%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
rename to tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
index 5fb547f4e..6b163ea3e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
@@ -44,7 +44,6 @@
     </dependency>
   </dependencies>
   <modules>
-    <module>tika-parser-html-commons</module>
     <module>tika-parser-jdbc-commons</module>
     <module>tika-parser-digest-commons</module>
     <module>tika-parser-mail-commons</module>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml
deleted file mode 100644
index 7e7a403bc..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml
+++ /dev/null
@@ -1,74 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <parent>
-    <artifactId>tika-parsers-standard-modules</artifactId>
-    <groupId>org.apache.tika</groupId>
-    <version>3.0.0-SNAPSHOT</version>
-  </parent>
-  <modelVersion>4.0.0</modelVersion>
-
-  <artifactId>tika-parser-html-commons</artifactId>
-  <name>Apache Tika html commons</name>
-
-  <dependencies>
-    <dependency>
-      <groupId>de.l3s.boilerpipe</groupId>
-      <artifactId>boilerpipe</artifactId>
-      <version>${boilerpipe.version}</version>
-    </dependency>
-  </dependencies>
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-jar-plugin</artifactId>
-        <configuration>
-          <archive>
-            <manifestEntries>
-              <Automatic-Module-Name>org.apache.tika.sax.boilerpipe</Automatic-Module-Name>
-            </manifestEntries>
-          </archive>
-        </configuration>
-        <executions>
-          <execution>
-            <goals>
-              <goal>test-jar</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.rat</groupId>
-        <artifactId>apache-rat-plugin</artifactId>
-        <version>${rat.version}</version>
-        <configuration>
-          <excludes>
-            <exclude>README.md</exclude>
-          </excludes>
-        </configuration>
-      </plugin>
-    </plugins>
-  </build>
-
-  <scm>
-    <tag>2.2.1-rc2</tag>
-  </scm>
-</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 4de5eeec4..cb23c96d7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -186,7 +186,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml
index b6794abb7..69a88523e 100644
--- a/tika-server/tika-server-core/pom.xml
+++ b/tika-server/tika-server-core/pom.xml
@@ -54,7 +54,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index a6b2f9b72..c38c40f50 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -50,8 +50,8 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
@@ -128,7 +128,7 @@
                   <exclude>org.apache.tika:tika-parsers-standard-package:jar:</exclude>
                   <exclude>org.apache.tika:tika-serialization:jar:</exclude>
                   <exclude>org.apache.tika:tika-langdetect-optimaize:jar:</exclude>
-                  <exclude>org.apache.tika:tika-parser-html-commons:jar:</exclude>
+                  <exclude>org.apache.tika:tika-handler-boilerpipe:jar:</exclude>
                   <exclude>org.apache.tika:tika-parser-digest-commons:jar:</exclude>
                   <exclude>org.apache.tika:tika-parser-zip-commons:jar:</exclude>
                   <exclude>commons-codec:commons-codec:jar:</exclude>