You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/13 18:32:21 UTC
[tika] 01/01: TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3641
in repository https://gitbox.apache.org/repos/asf/tika.git
commit e6cc939fff3eb13c692dcc633ada9646bf4ebca6
Author: tallison <ta...@apache.org>
AuthorDate: Wed Sep 13 14:32:05 2023 -0400
TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11
---
tika-eval/tika-eval-app/pom.xml | 4 +--
.../eval/app/tools/SlowCompositeReaderWrapper.java | 41 ++++++++++++++++++++++
tika-eval/tika-eval-core/pom.xml | 4 +--
.../core/tokens/AlphaIdeographFilterFactory.java | 11 ++++--
.../eval/core/tokens/AnalyzerDeserializer.java | 4 +--
.../tokens/CJKBigramAwareLengthFilterFactory.java | 8 ++++-
.../tokens/URLEmailNormalizingFilterFactory.java | 9 +++--
... org.apache.lucene.analysis.TokenFilterFactory} | 0
tika-parent/pom.xml | 12 +++----
tika-server/tika-server-eval/pom.xml | 4 +--
10 files changed, 76 insertions(+), 21 deletions(-)
diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 2b5f91cee..52151bed4 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -96,8 +96,8 @@
<exclude>org.apache.commons:commons-lang3:jar:</exclude>
<exclude>org.apache.commons:commons-math3:jar:</exclude>
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
- <exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
- <exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
+ <exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
+ <exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
<exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
index f6fc8e332..778e74367 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
@@ -22,6 +22,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.CompositeReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
@@ -29,6 +30,7 @@ import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafMetaData;
import org.apache.lucene.index.LeafReader;
@@ -45,7 +47,10 @@ import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.StoredFields;
+import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
@@ -264,12 +269,43 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
return MultiDocValues.getNormValues(in, field); // TODO cache?
}
+ @Override
+ public FloatVectorValues getFloatVectorValues(String s) throws IOException {
+ // TODO: not implemented -- always returns null; add real support only if vector fields are needed
+ return null;
+ }
+
+ @Override
+ public ByteVectorValues getByteVectorValues(String s) throws IOException {
+ // TODO: not implemented -- always returns null; add real support only if vector fields are needed
+ return null;
+ }
+
+ @Override
+ public TopDocs searchNearestVectors(String s, float[] floats, int i, Bits bits, int i1)
+ throws IOException {
+ // TODO: not implemented -- always returns null; NOTE(review): confirm no caller dereferences the result
+ return null;
+ }
+
+ @Override
+ public TopDocs searchNearestVectors(String s, byte[] bytes, int i, Bits bits, int i1)
+ throws IOException {
+ // TODO: not implemented -- always returns null; NOTE(review): confirm no caller dereferences the result
+ return null;
+ }
+
@Override
public Fields getTermVectors(int docID) throws IOException {
ensureOpen();
return in.getTermVectors(docID);
}
+ @Override
+ public TermVectors termVectors() throws IOException {
+ return in.termVectors(); // NOTE(review): no ensureOpen() here, unlike getTermVectors(int) -- confirm intended
+ }
+
@Override
public int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
@@ -288,6 +324,11 @@ public final class SlowCompositeReaderWrapper extends LeafReader {
in.document(docID, visitor);
}
+ @Override
+ public StoredFields storedFields() throws IOException {
+ return in.storedFields(); // NOTE(review): no ensureOpen() here, unlike getLiveDocs() -- confirm intended
+ }
+
@Override
public Bits getLiveDocs() {
ensureOpen();
diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml
index 4ee5c0a75..c619a1501 100644
--- a/tika-eval/tika-eval-core/pom.xml
+++ b/tika-eval/tika-eval-core/pom.xml
@@ -64,12 +64,12 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
+ <artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-icu</artifactId>
+ <artifactId>lucene-analysis-icu</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
index 1c67ce6a2..cd5476ecb 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
@@ -20,19 +20,26 @@ import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for filter that only allows tokens with characters that "isAlphabetic" or "isIdeographic" through.
*/
public class AlphaIdeographFilterFactory extends TokenFilterFactory {
+ public static final String NAME = "alphaIdeograph";
+
private static final int UNDERSCORE = (int) '_';
+
+ public AlphaIdeographFilterFactory() {
+ super(); // no-arg variant; presumably for service-loader instantiation -- TODO confirm
+ }
+
public AlphaIdeographFilterFactory(Map<String, String> args) {
super(args);
}
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
index 5c782011e..a20eafbbb 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
@@ -29,9 +29,9 @@ import java.util.Map;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
-import org.apache.lucene.analysis.util.ClasspathResourceLoader;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.util.ClasspathResourceLoader;
class AnalyzerDeserializer {
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
index 515c4e634..87d88343a 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
@@ -20,11 +20,11 @@ import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Creates a very narrowly focused TokenFilter that limits tokens based on length
@@ -35,9 +35,15 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*/
public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory {
+ public static final String NAME = "cjkBigramAwareLength";
+
private final int min;
private final int max;
+ public CJKBigramAwareLengthFilterFactory() {
+ min = 3; // default when no args map is supplied; presumably the minimum token length -- TODO confirm
+ max = 20; // default when no args map is supplied; presumably the maximum token length -- TODO confirm
+ }
public CJKBigramAwareLengthFilterFactory(Map<String, String> args) {
super(args);
diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
index baef4f72a..4ab2672d1 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
@@ -20,11 +20,11 @@ import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for filter that normalizes urls and emails to __url__ and __email__
@@ -35,11 +35,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*/
public class URLEmailNormalizingFilterFactory extends TokenFilterFactory {
+ public static final String NAME = "urlEmailNormalizing";
+
public static final String URL = "___url___";
public static final String EMAIL = "___email___";
private static final char[] URL_CHARS = URL.toCharArray();
private static final char[] EMAIL_CHARS = EMAIL.toCharArray();
+ public URLEmailNormalizingFilterFactory() {
+ super(); // no-arg variant; presumably for service-loader instantiation -- TODO confirm
+ }
public URLEmailNormalizingFilterFactory(Map<String, String> args) {
super(args);
}
diff --git a/tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
similarity index 100%
rename from tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
rename to tika-eval/tika-eval-core/src/main/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6547b90a8..5f6a1d45a 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -335,8 +335,7 @@
<guava.version>32.1.2-jre</guava.version>
<httpcomponents.version>4.5.14</httpcomponents.version>
<httpcore.version>4.4.16</httpcore.version>
- <!-- versions greater than 62.2 are not compatible with icu4j handler in lucene 8.11.1 -->
- <icu4j.version>62.2</icu4j.version>
+ <icu4j.version>73.2</icu4j.version>
<imageio.version>1.4.0</imageio.version>
<jackrabbit.version>2.21.19</jackrabbit.version>
<jackson.version>2.15.2</jackson.version>
@@ -361,8 +360,7 @@
<libpst.version>0.9.3</libpst.version>
<log4j2.version>2.20.0</log4j2.version>
<lombok.version>1.18.20</lombok.version>
- <!-- lucene >= 9.x requires Java 11 -->
- <lucene.version>8.11.2</lucene.version>
+ <lucene.version>9.7.0</lucene.version>
<metadata.extractor.version>2.18.0</metadata.extractor.version>
<microsoft.translator.version>0.6.2</microsoft.translator.version>
<mime4j.version>0.8.9</mime4j.version>
@@ -773,12 +771,12 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
+ <artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-icu</artifactId>
+ <artifactId>lucene-analysis-icu</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
@@ -1010,8 +1008,6 @@
<version>1.53.0</version>
</dependency>
<exclude>
- <!-- CVE-2018-18928 does affect the java library not just the c/c++ library,
- upon further research -->
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>${icu4j.version}</version>
diff --git a/tika-server/tika-server-eval/pom.xml b/tika-server/tika-server-eval/pom.xml
index c41f5c5a9..1af100100 100644
--- a/tika-server/tika-server-eval/pom.xml
+++ b/tika-server/tika-server-eval/pom.xml
@@ -67,8 +67,8 @@
<exclude>org.apache.commons:commons-lang3:jar:</exclude>
<exclude>org.apache.commons:commons-math3:jar:</exclude>
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
- <exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
- <exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
+ <exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
+ <exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
<exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>