You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/13 18:32:21 UTC

[tika] 01/01: TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3641
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e6cc939fff3eb13c692dcc633ada9646bf4ebca6
Author: tallison <ta...@apache.org>
AuthorDate: Wed Sep 13 14:32:05 2023 -0400

    TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11
---
 tika-eval/tika-eval-app/pom.xml                    |  4 +--
 .../eval/app/tools/SlowCompositeReaderWrapper.java | 41 ++++++++++++++++++++++
 tika-eval/tika-eval-core/pom.xml                   |  4 +--
 .../core/tokens/AlphaIdeographFilterFactory.java   | 11 ++++--
 .../eval/core/tokens/AnalyzerDeserializer.java     |  4 +--
 .../tokens/CJKBigramAwareLengthFilterFactory.java  |  8 ++++-
 .../tokens/URLEmailNormalizingFilterFactory.java   |  9 +++--
 ... org.apache.lucene.analysis.TokenFilterFactory} |  0
 tika-parent/pom.xml                                | 12 +++----
 tika-server/tika-server-eval/pom.xml               |  4 +--
 10 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 2b5f91cee..52151bed4 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -96,8 +96,8 @@
                   <exclude>org.apache.commons:commons-lang3:jar:</exclude>
                   <exclude>org.apache.commons:commons-math3:jar:</exclude>
                   <exclude>org.apache.lucene:lucene-core:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
                   <exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
                   <exclude>com.ibm.icu:icu4j:jar:</exclude>
                   <exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
index f6fc8e332..778e74367 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
@@ -22,6 +22,7 @@ import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.CompositeReader;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValues;
@@ -29,6 +30,7 @@ import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafMetaData;
 import org.apache.lucene.index.LeafReader;
@@ -45,7 +47,10 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.StoredFields;
+import org.apache.lucene.index.TermVectors;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.Version;
 
@@ -264,12 +269,43 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
         return MultiDocValues.getNormValues(in, field); // TODO cache?
     }
 
+    @Override
+    public FloatVectorValues getFloatVectorValues(String s) throws IOException {
+        // TODO: not implemented -- always returns null, whatever field name is passed in
+        return null;
+    }
+
+    @Override
+    public ByteVectorValues getByteVectorValues(String s) throws IOException {
+        // TODO: not implemented -- always returns null, whatever field name is passed in
+        return null;
+    }
+
+    @Override
+    public TopDocs searchNearestVectors(String s, float[] floats, int i, Bits bits, int i1)
+            throws IOException {
+        // TODO: not implemented -- ignores every argument and always returns null
+        return null;
+    }
+
+    @Override
+    public TopDocs searchNearestVectors(String s, byte[] bytes, int i, Bits bits, int i1)
+            throws IOException {
+        // TODO: not implemented -- ignores every argument and always returns null
+        return null;
+    }
+
     @Override
     public Fields getTermVectors(int docID) throws IOException {
         ensureOpen();
         return in.getTermVectors(docID);
     }
 
+    @Override
+    public TermVectors termVectors() throws IOException {
+        return in.termVectors(); // NOTE(review): no ensureOpen(), unlike getTermVectors(int)
+    }
+
     @Override
     public int numDocs() {
         // Don't call ensureOpen() here (it could affect performance)
@@ -288,6 +324,11 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
         in.document(docID, visitor);
     }
 
+    @Override
+    public StoredFields storedFields() throws IOException {
+        return in.storedFields(); // NOTE(review): no ensureOpen(), unlike getLiveDocs()
+    }
+
     @Override
     public Bits getLiveDocs() {
         ensureOpen();
diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml
index 4ee5c0a75..c619a1501 100644
--- a/tika-eval/tika-eval-core/pom.xml
+++ b/tika-eval/tika-eval-core/pom.xml
@@ -64,12 +64,12 @@
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analyzers-common</artifactId>
+      <artifactId>lucene-analysis-common</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analyzers-icu</artifactId>
+      <artifactId>lucene-analysis-icu</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
index 1c67ce6a2..cd5476ecb 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
@@ -20,19 +20,26 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Factory for filter that only allows tokens with characters that "isAlphabetic"  or "isIdeographic" through.
  */
 public class AlphaIdeographFilterFactory extends TokenFilterFactory {
 
+    public static final String NAME = "alphaIdeograph";
+
     private static final int UNDERSCORE = (int) '_';
 
+
+    public AlphaIdeographFilterFactory() {
+        super(); // no-arg ctor, presumably for SPI (META-INF/services) loading -- confirm
+    }
+
     public AlphaIdeographFilterFactory(Map<String, String> args) {
         super(args);
     }
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
index 5c782011e..a20eafbbb 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
@@ -29,9 +29,9 @@ import java.util.Map;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
-import org.apache.lucene.analysis.util.ClasspathResourceLoader;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.ClasspathResourceLoader;
 
 class AnalyzerDeserializer {
 
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
index 515c4e634..87d88343a 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
@@ -20,11 +20,11 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Creates a very narrowly focused TokenFilter that limits tokens based on length
@@ -35,9 +35,15 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  */
 public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory {
 
+    public static final String NAME = "cjkBigramAwareLength";
+
 
     private final int min;
     private final int max;
+    public CJKBigramAwareLengthFilterFactory() {
+        min = 3; // hard-coded; NOTE(review): confirm this matches the Map-based ctor default
+        max = 20; // hard-coded; NOTE(review): confirm this matches the Map-based ctor default
+    }
 
     public CJKBigramAwareLengthFilterFactory(Map<String, String> args) {
         super(args);
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
index baef4f72a..4ab2672d1 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
@@ -20,11 +20,11 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Factory for filter that normalizes urls and emails to __url__ and __email__
@@ -35,11 +35,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  */
 public class URLEmailNormalizingFilterFactory extends TokenFilterFactory {
 
+    public static final String NAME = "urlEmailNormalizing";
+
     public static final String URL = "___url___";
     public static final String EMAIL = "___email___";
     private static final char[] URL_CHARS = URL.toCharArray();
     private static final char[] EMAIL_CHARS = EMAIL.toCharArray();
 
+    public URLEmailNormalizingFilterFactory() {
+        super(); // no-arg ctor, presumably for SPI (META-INF/services) loading -- confirm
+    }
     public URLEmailNormalizingFilterFactory(Map<String, String> args) {
         super(args);
     }
diff --git a/tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
similarity index 100%
rename from tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
rename to tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6547b90a8..5f6a1d45a 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -335,8 +335,7 @@
     <guava.version>32.1.2-jre</guava.version>
     <httpcomponents.version>4.5.14</httpcomponents.version>
     <httpcore.version>4.4.16</httpcore.version>
-    <!-- versions greater than 62.2 are not compatible with icu4j handler in lucene 8.11.1 -->
-    <icu4j.version>62.2</icu4j.version>
+    <icu4j.version>73.2</icu4j.version>
     <imageio.version>1.4.0</imageio.version>
     <jackrabbit.version>2.21.19</jackrabbit.version>
     <jackson.version>2.15.2</jackson.version>
@@ -361,8 +360,7 @@
     <libpst.version>0.9.3</libpst.version>
     <log4j2.version>2.20.0</log4j2.version>
     <lombok.version>1.18.20</lombok.version>
-    <!-- lucene >= 9.x requires Java 11 -->
-    <lucene.version>8.11.2</lucene.version>
+    <lucene.version>9.7.0</lucene.version>
     <metadata.extractor.version>2.18.0</metadata.extractor.version>
     <microsoft.translator.version>0.6.2</microsoft.translator.version>
     <mime4j.version>0.8.9</mime4j.version>
@@ -773,12 +771,12 @@
       </dependency>
       <dependency>
         <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-analyzers-common</artifactId>
+        <artifactId>lucene-analysis-common</artifactId>
         <version>${lucene.version}</version>
       </dependency>
       <dependency>
         <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-analyzers-icu</artifactId>
+        <artifactId>lucene-analysis-icu</artifactId>
         <version>${lucene.version}</version>
       </dependency>
       <dependency>
@@ -1010,8 +1008,6 @@
               <version>1.53.0</version>
             </dependency>
             <exclude>
-              <!-- CVE-2018-18928 does affect the java library not just the c/c++ library,
-              upon further research -->
               <groupId>com.ibm.icu</groupId>
               <artifactId>icu4j</artifactId>
               <version>${icu4j.version}</version>
diff --git a/tika-server/tika-server-eval/pom.xml b/tika-server/tika-server-eval/pom.xml
index c41f5c5a9..1af100100 100644
--- a/tika-server/tika-server-eval/pom.xml
+++ b/tika-server/tika-server-eval/pom.xml
@@ -67,8 +67,8 @@
                   <exclude>org.apache.commons:commons-lang3:jar:</exclude>
                   <exclude>org.apache.commons:commons-math3:jar:</exclude>
                   <exclude>org.apache.lucene:lucene-core:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
                   <exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
                   <exclude>com.ibm.icu:icu4j:jar:</exclude>
                   <exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>