You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/22 14:32:03 UTC
[tika] 01/01: TIKA-4138 -- move BoilerpipeContentHandler
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4138
in repository https://gitbox.apache.org/repos/asf/tika.git
commit cbc46ee9b5295bf14541da8d1f016261c5e30196
Author: tallison <ta...@apache.org>
AuthorDate: Fri Sep 22 10:31:47 2023 -0400
TIKA-4138 -- move BoilerpipeContentHandler
---
CHANGES.txt | 5 ++
pom.xml | 1 +
tika-app/pom.xml | 2 +-
tika-bom/pom.xml | 2 +-
tika-bundles/tika-bundle-standard/pom.xml | 2 +-
tika-handlers/README.md | 2 +
tika-handlers/pom.xml | 48 ++++++++++++++
.../tika-handler-boilerpipe/pom.xml | 26 ++++++--
.../sax/boilerpipe/BoilerpipeContentHandler.java | 0
.../tika-parsers-standard-modules/pom.xml | 1 -
.../tika-parser-html-commons/pom.xml | 74 ----------------------
.../tika-parsers-standard-package/pom.xml | 2 +-
tika-server/tika-server-core/pom.xml | 2 +-
tika-server/tika-server-standard/pom.xml | 6 +-
14 files changed, 86 insertions(+), 87 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 30c137609..408e42676 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,12 @@
Release 3.0.0-BETA - ??
+ BREAKING CHANGES
+
* Require Java 11 (TIKA-4128).
+ * The boilerpipe handler has been moved to the tika-handler-boilerpipe module (TIKA-4138).
+
+ Other Changes/Updates
* Fix bug in DateUtils that stripped timezone information from
incoming Calendar objects (TIKA-4126).
diff --git a/pom.xml b/pom.xml
index ab6b22afa..31f025576 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,6 +54,7 @@
<module>tika-example</module>
<module>tika-java7</module>
<module>tika-detectors</module>
+ <module>tika-handlers</module>
</modules>
<profiles>
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 9a48d2ea9..68ac79477 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -45,7 +45,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <artifactId>tika-handler-boilerpipe</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index ba2e19d73..5e1aca01e 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -222,7 +222,7 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <artifactId>tika-handler-boilerpipe</artifactId>
<version>3.0.0-SNAPSHOT</version>
</dependency>
<dependency>
diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml
index db605c044..1e18b1cb0 100644
--- a/tika-bundles/tika-bundle-standard/pom.xml
+++ b/tika-bundles/tika-bundle-standard/pom.xml
@@ -58,7 +58,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <artifactId>tika-handler-boilerpipe</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
diff --git a/tika-handlers/README.md b/tika-handlers/README.md
new file mode 100644
index 000000000..bb45651b3
--- /dev/null
+++ b/tika-handlers/README.md
@@ -0,0 +1,2 @@
+This module is intended to hold non-standard handlers. These may have dependencies that some users don't want,
+or they may have a focus that isn't general enough to warrant adding them to tika-core.
\ No newline at end of file
diff --git a/tika-handlers/pom.xml b/tika-handlers/pom.xml
new file mode 100644
index 000000000..fcab3eb20
--- /dev/null
+++ b/tika-handlers/pom.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>3.0.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-handlers</artifactId>
+
+ <name>Apache Tika handlers</name>
+ <packaging>pom</packaging>
+
+ <modules>
+ <module>tika-handler-boilerpipe</module>
+ </modules>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md b/tika-handlers/tika-handler-boilerpipe/pom.xml
similarity index 51%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
rename to tika-handlers/tika-handler-boilerpipe/pom.xml
index 82fb00a47..05d0b69b3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/README.md
+++ b/tika-handlers/tika-handler-boilerpipe/pom.xml
@@ -1,4 +1,5 @@
-<!---
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
@@ -16,7 +17,24 @@
specific language governing permissions and limitations
under the License.
-->
-This module only contains the BoilerPipeContentHandler. The boilerpipe dependency is no
-longer maintained and contains clashes with NekoHTML.
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-handlers</artifactId>
+ <version>3.0.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
-In Tika 3.x, we should rename this module to tika-handler-boilerpipe or similar.
\ No newline at end of file
+ <artifactId>tika-handler-boilerpipe</artifactId>
+
+ <dependencies>
+ <dependency>
+ <groupId>de.l3s.boilerpipe</groupId>
+ <artifactId>boilerpipe</artifactId>
+ <version>${boilerpipe.version}</version> <!-- same property the removed tika-parser-html-commons pom used; avoids a hard-coded version. NOTE(review): confirm the property is defined in an ancestor pom of this module -->
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java b/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
similarity index 100%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
rename to tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
index 5fb547f4e..6b163ea3e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
@@ -44,7 +44,6 @@
</dependency>
</dependencies>
<modules>
- <module>tika-parser-html-commons</module>
<module>tika-parser-jdbc-commons</module>
<module>tika-parser-digest-commons</module>
<module>tika-parser-mail-commons</module>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml
deleted file mode 100644
index 7e7a403bc..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/pom.xml
+++ /dev/null
@@ -1,74 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
- <parent>
- <artifactId>tika-parsers-standard-modules</artifactId>
- <groupId>org.apache.tika</groupId>
- <version>3.0.0-SNAPSHOT</version>
- </parent>
- <modelVersion>4.0.0</modelVersion>
-
- <artifactId>tika-parser-html-commons</artifactId>
- <name>Apache Tika html commons</name>
-
- <dependencies>
- <dependency>
- <groupId>de.l3s.boilerpipe</groupId>
- <artifactId>boilerpipe</artifactId>
- <version>${boilerpipe.version}</version>
- </dependency>
- </dependencies>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <configuration>
- <archive>
- <manifestEntries>
- <Automatic-Module-Name>org.apache.tika.sax.boilerpipe</Automatic-Module-Name>
- </manifestEntries>
- </archive>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- <version>${rat.version}</version>
- <configuration>
- <excludes>
- <exclude>README.md</exclude>
- </excludes>
- </configuration>
- </plugin>
- </plugins>
- </build>
-
- <scm>
- <tag>2.2.1-rc2</tag>
- </scm>
-</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 4de5eeec4..cb23c96d7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -186,7 +186,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <artifactId>tika-handler-boilerpipe</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml
index b6794abb7..69a88523e 100644
--- a/tika-server/tika-server-core/pom.xml
+++ b/tika-server/tika-server-core/pom.xml
@@ -54,7 +54,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <artifactId>tika-handler-boilerpipe</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index a6b2f9b72..c38c40f50 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -50,8 +50,8 @@
</exclusions>
</dependency>
<dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-html-commons</artifactId>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-handler-boilerpipe</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -128,7 +128,7 @@
<exclude>org.apache.tika:tika-parsers-standard-package:jar:</exclude>
<exclude>org.apache.tika:tika-serialization:jar:</exclude>
<exclude>org.apache.tika:tika-langdetect-optimaize:jar:</exclude>
- <exclude>org.apache.tika:tika-parser-html-commons:jar:</exclude>
+ <exclude>org.apache.tika:tika-handler-boilerpipe:jar:</exclude>
<exclude>org.apache.tika:tika-parser-digest-commons:jar:</exclude>
<exclude>org.apache.tika:tika-parser-zip-commons:jar:</exclude>
<exclude>commons-codec:commons-codec:jar:</exclude>