You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/13 18:32:20 UTC

[tika] branch TIKA-3641 created (now e6cc939ff)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3641
in repository https://gitbox.apache.org/repos/asf/tika.git


      at e6cc939ff TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11

This branch includes the following new commits:

     new e6cc939ff TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3641
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e6cc939fff3eb13c692dcc633ada9646bf4ebca6
Author: tallison <ta...@apache.org>
AuthorDate: Wed Sep 13 14:32:05 2023 -0400

    TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11
---
 tika-eval/tika-eval-app/pom.xml                    |  4 +--
 .../eval/app/tools/SlowCompositeReaderWrapper.java | 41 ++++++++++++++++++++++
 tika-eval/tika-eval-core/pom.xml                   |  4 +--
 .../core/tokens/AlphaIdeographFilterFactory.java   | 11 ++++--
 .../eval/core/tokens/AnalyzerDeserializer.java     |  4 +--
 .../tokens/CJKBigramAwareLengthFilterFactory.java  |  8 ++++-
 .../tokens/URLEmailNormalizingFilterFactory.java   |  9 +++--
 ... org.apache.lucene.analysis.TokenFilterFactory} |  0
 tika-parent/pom.xml                                | 12 +++----
 tika-server/tika-server-eval/pom.xml               |  4 +--
 10 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 2b5f91cee..52151bed4 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -96,8 +96,8 @@
                   <exclude>org.apache.commons:commons-lang3:jar:</exclude>
                   <exclude>org.apache.commons:commons-math3:jar:</exclude>
                   <exclude>org.apache.lucene:lucene-core:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
                   <exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
                   <exclude>com.ibm.icu:icu4j:jar:</exclude>
                   <exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
index f6fc8e332..778e74367 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
@@ -22,6 +22,7 @@ import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.CompositeReader;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocValues;
@@ -29,6 +30,7 @@ import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafMetaData;
 import org.apache.lucene.index.LeafReader;
@@ -45,7 +47,10 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.StoredFields;
+import org.apache.lucene.index.TermVectors;
 import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.Version;
 
@@ -264,12 +269,43 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
         return MultiDocValues.getNormValues(in, field); // TODO cache?
     }
 
+    @Override
+    public FloatVectorValues getFloatVectorValues(String s) throws IOException {
+        //TODO figure out how to implement this... if needed
+        return null;
+    }
+
+    @Override
+    public ByteVectorValues getByteVectorValues(String s) throws IOException {
+        //TODO figure out how to implement this... if needed
+        return null;
+    }
+
+    @Override
+    public TopDocs searchNearestVectors(String s, float[] floats, int i, Bits bits, int i1)
+            throws IOException {
+        //TODO figure out how to implement this... if needed
+        return null;
+    }
+
+    @Override
+    public TopDocs searchNearestVectors(String s, byte[] bytes, int i, Bits bits, int i1)
+            throws IOException {
+        //TODO figure out how to implement this... if needed
+        return null;
+    }
+
     @Override
     public Fields getTermVectors(int docID) throws IOException {
         ensureOpen();
         return in.getTermVectors(docID);
     }
 
+    @Override
+    public TermVectors termVectors() throws IOException {
+        return in.termVectors();
+    }
+
     @Override
     public int numDocs() {
         // Don't call ensureOpen() here (it could affect performance)
@@ -288,6 +324,11 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
         in.document(docID, visitor);
     }
 
+    @Override
+    public StoredFields storedFields() throws IOException {
+        return in.storedFields();
+    }
+
     @Override
     public Bits getLiveDocs() {
         ensureOpen();
diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml
index 4ee5c0a75..c619a1501 100644
--- a/tika-eval/tika-eval-core/pom.xml
+++ b/tika-eval/tika-eval-core/pom.xml
@@ -64,12 +64,12 @@
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analyzers-common</artifactId>
+      <artifactId>lucene-analysis-common</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analyzers-icu</artifactId>
+      <artifactId>lucene-analysis-icu</artifactId>
       <version>${lucene.version}</version>
     </dependency>
     <dependency>
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
index 1c67ce6a2..cd5476ecb 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
@@ -20,19 +20,26 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Factory for filter that only allows tokens with characters that "isAlphabetic"  or "isIdeographic" through.
  */
 public class AlphaIdeographFilterFactory extends TokenFilterFactory {
 
+    public static final String NAME = "alphaIdeograph";
+
     private static final int UNDERSCORE = (int) '_';
 
+
+    public AlphaIdeographFilterFactory() {
+        super();
+    }
+
     public AlphaIdeographFilterFactory(Map<String, String> args) {
         super(args);
     }
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
index 5c782011e..a20eafbbb 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
@@ -29,9 +29,9 @@ import java.util.Map;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
-import org.apache.lucene.analysis.util.ClasspathResourceLoader;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.ClasspathResourceLoader;
 
 class AnalyzerDeserializer {
 
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
index 515c4e634..87d88343a 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
@@ -20,11 +20,11 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Creates a very narrowly focused TokenFilter that limits tokens based on length
@@ -35,9 +35,15 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  */
 public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory {
 
+    public static final String NAME = "cjkBigramAwareLength";
+
 
     private final int min;
     private final int max;
+    public CJKBigramAwareLengthFilterFactory() {
+        min = 3;
+        max = 20;
+    }
 
     public CJKBigramAwareLengthFilterFactory(Map<String, String> args) {
         super(args);
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
index baef4f72a..4ab2672d1 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
@@ -20,11 +20,11 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**
  * Factory for filter that normalizes urls and emails to __url__ and __email__
@@ -35,11 +35,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  */
 public class URLEmailNormalizingFilterFactory extends TokenFilterFactory {
 
+    public static final String NAME = "urlEmailNormalizing";
+
     public static final String URL = "___url___";
     public static final String EMAIL = "___email___";
     private static final char[] URL_CHARS = URL.toCharArray();
     private static final char[] EMAIL_CHARS = EMAIL.toCharArray();
 
+    public URLEmailNormalizingFilterFactory() {
+        super();
+    }
     public URLEmailNormalizingFilterFactory(Map<String, String> args) {
         super(args);
     }
diff --git a/tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
similarity index 100%
rename from tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
rename to tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6547b90a8..5f6a1d45a 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -335,8 +335,7 @@
     <guava.version>32.1.2-jre</guava.version>
     <httpcomponents.version>4.5.14</httpcomponents.version>
     <httpcore.version>4.4.16</httpcore.version>
-    <!-- versions greater than 62.2 are not compatible with icu4j handler in lucene 8.11.1 -->
-    <icu4j.version>62.2</icu4j.version>
+    <icu4j.version>73.2</icu4j.version>
     <imageio.version>1.4.0</imageio.version>
     <jackrabbit.version>2.21.19</jackrabbit.version>
     <jackson.version>2.15.2</jackson.version>
@@ -361,8 +360,7 @@
     <libpst.version>0.9.3</libpst.version>
     <log4j2.version>2.20.0</log4j2.version>
     <lombok.version>1.18.20</lombok.version>
-    <!-- lucene >= 9.x requires Java 11 -->
-    <lucene.version>8.11.2</lucene.version>
+    <lucene.version>9.7.0</lucene.version>
     <metadata.extractor.version>2.18.0</metadata.extractor.version>
     <microsoft.translator.version>0.6.2</microsoft.translator.version>
     <mime4j.version>0.8.9</mime4j.version>
@@ -773,12 +771,12 @@
       </dependency>
       <dependency>
         <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-analyzers-common</artifactId>
+        <artifactId>lucene-analysis-common</artifactId>
         <version>${lucene.version}</version>
       </dependency>
       <dependency>
         <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-analyzers-icu</artifactId>
+        <artifactId>lucene-analysis-icu</artifactId>
         <version>${lucene.version}</version>
       </dependency>
       <dependency>
@@ -1010,8 +1008,6 @@
               <version>1.53.0</version>
             </dependency>
             <exclude>
-              <!-- CVE-2018-18928 does affect the java library not just the c/c++ library,
-              upon further research -->
               <groupId>com.ibm.icu</groupId>
               <artifactId>icu4j</artifactId>
               <version>${icu4j.version}</version>
diff --git a/tika-server/tika-server-eval/pom.xml b/tika-server/tika-server-eval/pom.xml
index c41f5c5a9..1af100100 100644
--- a/tika-server/tika-server-eval/pom.xml
+++ b/tika-server/tika-server-eval/pom.xml
@@ -67,8 +67,8 @@
                   <exclude>org.apache.commons:commons-lang3:jar:</exclude>
                   <exclude>org.apache.commons:commons-math3:jar:</exclude>
                   <exclude>org.apache.lucene:lucene-core:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
-                  <exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
+                  <exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
                   <exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
                   <exclude>com.ibm.icu:icu4j:jar:</exclude>
                   <exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>