You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/22 14:32:02 UTC

[tika] branch TIKA-4138 created (now cbc46ee9b)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4138
in repository https://gitbox.apache.org/repos/asf/tika.git


      at cbc46ee9b TIKA-4138 -- move BoilerpipeContentHandler

This branch includes the following new commits:

     new cbc46ee9b TIKA-4138 -- move BoilerpipeContentHandler

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4138 -- move BoilerpipeContentHandler

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4138
in repository https://gitbox.apache.org/repos/asf/tika.git

commit cbc46ee9b5295bf14541da8d1f016261c5e30196
Author: tallison <ta...@apache.org>
AuthorDate: Fri Sep 22 10:31:47 2023 -0400

    TIKA-4138 -- move BoilerpipeContentHandler
---
 CHANGES.txt                                        |  5 ++
 pom.xml                                            |  1 +
 tika-app/pom.xml                                   |  2 +-
 tika-bom/pom.xml                                   |  2 +-
 tika-bundles/tika-bundle-standard/pom.xml          |  2 +-
 tika-handlers/README.md                            |  2 +
 tika-handlers/pom.xml                              | 48 ++++++++++++++
 .../tika-handler-boilerpipe/pom.xml                | 26 ++++++--
 .../sax/boilerpipe/BoilerpipeContentHandler.java   |  0
 .../tika-parsers-standard-modules/pom.xml          |  1 -
 .../tika-parser-html-commons/pom.xml               | 74 ----------------------
 .../tika-parsers-standard-package/pom.xml          |  2 +-
 tika-server/tika-server-core/pom.xml               |  2 +-
 tika-server/tika-server-standard/pom.xml           |  6 +-
 14 files changed, 86 insertions(+), 87 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 30c137609..408e42676 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,12 @@
 Release 3.0.0-BETA - ??
 
+   BREAKING CHANGES
+
    * Require Java 11 (TIKA-4128).
 
+   * The boilerpipe handler has been moved to tika-handler-boilerpipe (TIKA-4138).
+
+   Other Changes/Updates
    * Fix bug in DateUtils that stripped timezone information from
      incoming Calendar objects (TIKA-4126).
 
diff --git a/pom.xml b/pom.xml
index ab6b22afa..31f025576 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,6 +54,7 @@
     <module>tika-example</module>
     <module>tika-java7</module>
     <module>tika-detectors</module>
+    <module>tika-handlers</module>
   </modules>
 
   <profiles>
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 9a48d2ea9..68ac79477 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -45,7 +45,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index ba2e19d73..5e1aca01e 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -222,7 +222,7 @@
       </dependency>
       <dependency>
         <groupId>org.apache.tika</groupId>
-        <artifactId>tika-parser-html-commons</artifactId>
+        <artifactId>tika-handler-boilerpipe</artifactId>
         <version>3.0.0-SNAPSHOT</version>
       </dependency>
       <dependency>
diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index db605c044..1e18b1cb0 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -58,7 +58,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-handlers/README.md b/tika-handlers/README.md
new file mode 100644
index 000000000..bb45651b3
--- /dev/null
+++ b/tika-handlers/README.md
@@ -0,0 +1,2 @@
+This module is intended to hold non-standard handlers. These may have dependencies that some users don't want,
+or they may have a focus that isn't general enough to warrant adding them to tika-core.
\ No newline at end of file
diff --git a/tika-handlers/pom.xml b/tika-handlers/pom.xml
new file mode 100644
index 000000000..fcab3eb20
--- /dev/null
+++ b/tika-handlers/pom.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parent</artifactId>
+    <version>3.0.0-SNAPSHOT</version>
+    <relativePath>../tika-parent/pom.xml</relativePath>
+  </parent>
+
+  <artifactId>tika-handlers</artifactId>
+
+  <name>Apache Tika handlers</name>
+  <packaging>pom</packaging>
+
+  <modules>
+    <module>tika-handler-boilerpipe</module>
+  </modules>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md b/tika-handlers/tika-handler-boilerpipe/pom.xml
similarity index 51%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
rename to tika-handlers/tika-handler-boilerpipe/pom.xml
index 82fb00a47..05d0b69b3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
+++ b/tika-handlers/tika-handler-boilerpipe/pom.xml
@@ -1,4 +1,5 @@
-<!---
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
@@ -16,7 +17,24 @@
   specific language governing permissions and limitations
   under the License.
 -->
-This module only contains the BoilerPipeContentHandler.  The boilerpipe dependency is no 
-longer maintained and contains clashes with NekoHTML.
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-handlers</artifactId>
+    <version>3.0.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
 
-In Tika 3.x, we should rename this module to tika-handler-boilerpipe or similar.
\ No newline at end of file
+  <artifactId>tika-handler-boilerpipe</artifactId>
+
+  <dependencies>
+    <dependency>
+      <groupId>de.l3s.boilerpipe</groupId>
+      <artifactId>boilerpipe</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java b/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
similarity index 100%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
rename to tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
index 5fb547f4e..6b163ea3e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
@@ -44,7 +44,6 @@
     </dependency>
   </dependencies>
   <modules>
-    <module>tika-parser-html-commons</module>
     <module>tika-parser-jdbc-commons</module>
     <module>tika-parser-digest-commons</module>
     <module>tika-parser-mail-commons</module>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml
deleted file mode 100644
index 7e7a403bc..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml
+++ /dev/null
@@ -1,74 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <parent>
-    <artifactId>tika-parsers-standard-modules</artifactId>
-    <groupId>org.apache.tika</groupId>
-    <version>3.0.0-SNAPSHOT</version>
-  </parent>
-  <modelVersion>4.0.0</modelVersion>
-
-  <artifactId>tika-parser-html-commons</artifactId>
-  <name>Apache Tika html commons</name>
-
-  <dependencies>
-    <dependency>
-      <groupId>de.l3s.boilerpipe</groupId>
-      <artifactId>boilerpipe</artifactId>
-      <version>${boilerpipe.version}</version>
-    </dependency>
-  </dependencies>
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-jar-plugin</artifactId>
-        <configuration>
-          <archive>
-            <manifestEntries>
-              <Automatic-Module-Name>org.apache.tika.sax.boilerpipe</Automatic-Module-Name>
-            </manifestEntries>
-          </archive>
-        </configuration>
-        <executions>
-          <execution>
-            <goals>
-              <goal>test-jar</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.rat</groupId>
-        <artifactId>apache-rat-plugin</artifactId>
-        <version>${rat.version}</version>
-        <configuration>
-          <excludes>
-            <exclude>README.md</exclude>
-          </excludes>
-        </configuration>
-      </plugin>
-    </plugins>
-  </build>
-
-  <scm>
-    <tag>2.2.1-rc2</tag>
-  </scm>
-</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 4de5eeec4..cb23c96d7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -186,7 +186,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml
index b6794abb7..69a88523e 100644
--- a/tika-server/tika-server-core/pom.xml
+++ b/tika-server/tika-server-core/pom.xml
@@ -54,7 +54,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index a6b2f9b72..c38c40f50 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -50,8 +50,8 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika-parser-html-commons</artifactId>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-handler-boilerpipe</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
@@ -128,7 +128,7 @@
                   <exclude>org.apache.tika:tika-parsers-standard-package:jar:</exclude>
                   <exclude>org.apache.tika:tika-serialization:jar:</exclude>
                   <exclude>org.apache.tika:tika-langdetect-optimaize:jar:</exclude>
-                  <exclude>org.apache.tika:tika-parser-html-commons:jar:</exclude>
+                  <exclude>org.apache.tika:tika-handler-boilerpipe:jar:</exclude>
                   <exclude>org.apache.tika:tika-parser-digest-commons:jar:</exclude>
                   <exclude>org.apache.tika:tika-parser-zip-commons:jar:</exclude>
                   <exclude>commons-codec:commons-codec:jar:</exclude>