You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/09 18:52:13 UTC
[tika] branch branch_1x updated: TIKA-2695 -- upgrade Lucene to 7.4.0
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 2cdf627 TIKA-2695 -- upgrade Lucene to 7.4.0
2cdf627 is described below
commit 2cdf627eb854dee878a81fec2c6ededa36132470
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Aug 9 14:40:57 2018 -0400
TIKA-2695 -- upgrade Lucene to 7.4.0
---
tika-eval/pom.xml | 3 +-
.../eval/tokens/AlphaIdeographFilterFactory.java | 2 +-
.../tika/eval/tokens/AnalyzerDeserializer.java | 179 +++---------------
.../tokens/CJKBigramAwareLengthFilterFactory.java | 2 +-
tika-eval/src/main/resources/lucene-analyzers.json | 22 +--
tika-example/pom.xml | 2 +-
.../tika/example/LazyTextExtractorField.java | 210 ---------------------
.../org/apache/tika/example/LuceneIndexer.java | 7 +-
.../apache/tika/example/LuceneIndexerExtended.java | 21 +--
.../tika/example/MetadataAwareLuceneIndexer.java | 7 +-
.../java/org/apache/tika/example/RecentFiles.java | 20 +-
11 files changed, 63 insertions(+), 412 deletions(-)
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 8d21a49..c7d28fd 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -35,8 +35,7 @@
<properties>
<cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? -->
- <!-- upgrade to 6.x or something more modern once Tika requires Java 1.8 -->
- <lucene.version>5.5.4</lucene.version>
+ <lucene.version>7.4.0</lucene.version>
<poi.version>3.17</poi.version>
</properties>
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
index 771322b..6991b03 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
@@ -19,9 +19,9 @@ package org.apache.tika.eval.tokens;
import java.io.IOException;
import java.util.Map;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
index 2389309..b9a37b3 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
@@ -18,7 +18,6 @@ package org.apache.tika.eval.tokens;
import java.io.IOException;
-import java.io.Reader;
import java.lang.reflect.Type;
import java.util.Collections;
import java.util.HashMap;
@@ -33,14 +32,10 @@ import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import org.apache.lucene.analysis.util.TokenizerFactory;
class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
@@ -97,17 +92,15 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
}
JsonObject aRoot = (JsonObject)value;
- CharFilterFactory[] charFilters = new CharFilterFactory[0];
- TokenizerFactory tokenizerFactory = null;
- TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
+ CustomAnalyzer.Builder builder = CustomAnalyzer.builder(new ClasspathResourceLoader(AnalyzerDeserializer.class));
for ( Map.Entry<String, JsonElement> e : aRoot.entrySet()) {
String k = e.getKey();
if (k.equals(CHAR_FILTERS)) {
- charFilters = buildCharFilters(e.getValue(), analyzerName);
+ buildCharFilters(e.getValue(), analyzerName, builder);
} else if (k.equals(TOKEN_FILTERS)) {
- tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
+ buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens, builder);
} else if (k.equals(TOKENIZER)) {
- tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
+ buildTokenizerFactory(e.getValue(), analyzerName, builder);
} else if (! k.equals(COMMENT)) {
throw new IllegalArgumentException("Should have one of three values here:"+
CHAR_FILTERS + ", "+
@@ -116,13 +109,11 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
". I don't recognize: "+k);
}
}
- if (tokenizerFactory == null) {
- throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
- }
- return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
+ return builder.build();
}
- private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
+ private static void buildTokenizerFactory(JsonElement map, String analyzerName,
+ CustomAnalyzer.Builder builder) throws IOException {
if (!(map instanceof JsonObject)) {
throw new IllegalArgumentException("Expecting a map with \"factory\" string and " +
"\"params\" map in tokenizer factory;"+
@@ -139,33 +130,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
JsonElement paramsEl = ((JsonObject)map).get(PARAMS);
Map<String, String> params = mapify(paramsEl);
- String spiName = "";
- for (String s : TokenizerFactory.availableTokenizers()) {
- Class clazz = TokenizerFactory.lookupClass(s);
- if (clazz.getName().equals(factoryName)) {
- spiName = s;
- break;
- }
- }
- if (spiName.equals("")) {
- throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name"+
- "'"+factoryName+"' does not exist.");
- }
- try {
- TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
- if (tokenizerFactory instanceof ResourceLoaderAware) {
- ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
- }
-
- return tokenizerFactory;
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException("While working on "+analyzerName, e);
- }
+ builder.withTokenizer(factoryName, params);
}
- private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException {
+ private static void buildCharFilters(JsonElement el,
+ String analyzerName, CustomAnalyzer.Builder builder) throws IOException {
if (el == null || el.isJsonNull()) {
- return null;
+ return;
}
if (! el.isJsonArray()) {
throw new IllegalArgumentException("Expecting array for charfilters, but got:"+el.toString() +
@@ -188,40 +159,15 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS);
Map<String, String> params = mapify(paramsEl);
- String spiName = "";
- for (String s : CharFilterFactory.availableCharFilters()) {
- Class clazz = CharFilterFactory.lookupClass(s);
- if (clazz.getName().equals(factoryName)) {
- spiName = s;
- break;
- }
- }
- if (spiName.equals("")) {
- throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name"+
- "'"+factoryName+"' does not exist.");
- }
-
- try {
- CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params);
- if (charFilterFactory instanceof ResourceLoaderAware) {
- ((ResourceLoaderAware) charFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
- }
- ret.add(charFilterFactory);
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException("While trying to load "+
- analyzerName + ": "+ e.getMessage(), e);
- }
- }
- if (ret.size() == 0) {
- return new CharFilterFactory[0];
+ builder.addCharFilter(factoryName, params);
}
- return ret.toArray(new CharFilterFactory[ret.size()]);
}
- private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el,
- String analyzerName, int maxTokens) throws IOException {
+ private static void buildTokenFilterFactories(JsonElement el,
+ String analyzerName,
+ int maxTokens, CustomAnalyzer.Builder builder) throws IOException {
if (el == null || el.isJsonNull()) {
- return null;
+ return;
}
if (! el.isJsonArray()) {
throw new IllegalArgumentException(
@@ -242,43 +188,18 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
factoryName = factoryName.startsWith("oala.") ?
factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") :
factoryName;
-
JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS);
Map<String, String> params = mapify(paramsEl);
- String spiName = "";
- for (String s : TokenFilterFactory.availableTokenFilters()) {
- Class clazz = TokenFilterFactory.lookupClass(s);
- if (clazz.getName().equals(factoryName)) {
- spiName = s;
- break;
- }
- }
- if (spiName.equals("")) {
- throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name"+
- "'"+factoryName+"' does not exist.");
- }
-
- try {
- TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params);
- if (tokenFilterFactory instanceof ResourceLoaderAware) {
- ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
- }
- ret.add(tokenFilterFactory);
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException("While loading "+analyzerName, e);
- }
+ builder.addTokenFilter(factoryName, params);
}
if (maxTokens > -1) {
Map<String, String> m = new HashMap<>();
m.put("maxTokenCount", Integer.toString(maxTokens));
- ret.add(new LimitTokenCountFilterFactory(m));
- }
-
- if (ret.size() == 0) {
- return new TokenFilterFactory[0];
+ builder.addTokenFilter(
+ "limittokencount",
+ m);
}
- return ret.toArray(new TokenFilterFactory[ret.size()]);
}
private static Map<String, String> mapify(JsonElement paramsEl) {
@@ -299,62 +220,4 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
}
return params;
}
-
- /**
- * Plagiarized verbatim from Solr!
- */
- private static class MyTokenizerChain extends Analyzer {
-
- final private CharFilterFactory[] charFilters;
- final private TokenizerFactory tokenizer;
- final private TokenFilterFactory[] filters;
-
- public MyTokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
- this(null, tokenizer, filters);
- }
-
- public MyTokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
- this.charFilters = charFilters;
- this.tokenizer = tokenizer;
- this.filters = filters;
- }
-
- public CharFilterFactory[] getCharFilterFactories() {
- return charFilters;
- }
-
- public TokenizerFactory getTokenizerFactory() {
- return tokenizer;
- }
-
- public TokenFilterFactory[] getTokenFilterFactories() {
- return filters;
- }
-
- @Override
- public Reader initReader(String fieldName, Reader reader) {
-
- if (charFilters != null && charFilters.length > 0) {
- Reader cs = reader;
- for (CharFilterFactory charFilter : charFilters) {
- cs = charFilter.create(cs);
- }
- reader = cs;
- }
-
- return reader;
- }
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tk = tokenizer.create();
- TokenStream ts = tk;
- for (TokenFilterFactory filter : filters) {
- ts = filter.create(ts);
- }
-
- return new TokenStreamComponents(tk, ts);
- }
- }
-
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
index 215ef9c..73c8571 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json
index aa24b79..379bf84 100644
--- a/tika-eval/src/main/resources/lucene-analyzers.json
+++ b/tika-eval/src/main/resources/lucene-analyzers.json
@@ -3,23 +3,23 @@
"general": {
"charfilters": [
{
- "factory": "oala.charfilter.MappingCharFilterFactory",
+ "factory": "mapping",
"params": {
"mapping": "/lucene-char-mapping.txt"
}
}
],
"tokenizer": {
- "factory": "oala.standard.UAX29URLEmailTokenizerFactory",
+ "factory": "uax29urlemail",
"params": {}
},
"tokenfilters": [
{
- "factory": "oala.icu.ICUFoldingFilterFactory",
+ "factory": "icufolding",
"params": {}
},
{
- "factory": "oala.cjk.CJKBigramFilterFactory",
+ "factory": "cjkbigram",
"params": {
"outputUnigrams": "false"
}
@@ -30,20 +30,20 @@
"_comment" : "Use this analyzer for counting common tokens in a corpus.",
"_comment" : "This isn't used by tika-eval during profiling or comparing",
"tokenizer": {
- "factory": "oala.standard.UAX29URLEmailTokenizerFactory",
+ "factory": "uax29urlemail",
"params": {}
},
"tokenfilters": [
{
- "factory": "oala.icu.ICUFoldingFilterFactory",
+ "factory": "icufolding",
"params": {}
},
{
- "factory": "org.apache.tika.eval.tokens.AlphaIdeographFilterFactory",
+ "factory": "alphaideograph",
"params": {}
},
{
- "factory": "oala.pattern.PatternReplaceFilterFactory",
+ "factory": "patternreplace",
"params": {
"pattern": "^[\\w+\\.]{1,30}@(?:\\w+\\.){1,10}\\w+$",
"replacement": "___email___",
@@ -51,7 +51,7 @@
}
},
{
- "factory": "oala.pattern.PatternReplaceFilterFactory",
+ "factory": "patternreplace",
"params": {
"pattern": "^(?:(?:ftp|https?):\\/\\/)?(?:\\w+\\.){1,10}\\w+$",
"replacement": "___url___",
@@ -59,13 +59,13 @@
}
},
{
- "factory": "oala.cjk.CJKBigramFilterFactory",
+ "factory": "cjkbigram",
"params": {
"outputUnigrams": "false"
}
},
{
- "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory",
+ "factory": "cjkbigramawarelength",
"params": {
"min": 4,
"max": 20
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index 86f5cee..26ec9ec 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -127,7 +127,7 @@
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
- <version>3.5.0</version>
+ <version>7.4.0</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
diff --git a/tika-example/src/main/java/org/apache/tika/example/LazyTextExtractorField.java b/tika-example/src/main/java/org/apache/tika/example/LazyTextExtractorField.java
deleted file mode 100755
index 7567bc4..0000000
--- a/tika-example/src/main/java/org/apache/tika/example/LazyTextExtractorField.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.example;
-
-import java.io.InputStream;
-import java.io.Reader;
-import java.util.concurrent.Executor;
-
-import org.apache.jackrabbit.core.query.lucene.FieldNames;
-import org.apache.jackrabbit.core.value.InternalValue;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.AbstractField;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.document.Field.TermVector;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * <code>LazyTextExtractorField</code> implements a Lucene field with a String
- * value that is lazily initialized from a given {@link Reader}. In addition
- * this class provides a method to find out whether the purpose of the reader is
- * to extract text and whether the extraction process is already finished.
- *
- * @see #isExtractorFinished()
- */
-@SuppressWarnings("serial")
-public class LazyTextExtractorField extends AbstractField {
- /**
- * The logger instance for this class.
- */
- private static final Logger LOG = LoggerFactory.getLogger(LazyTextExtractorField.class);
-
- /**
- * The exception used to forcibly terminate the extraction process when the
- * maximum field length is reached.
- * <p>
- * Such exceptions shouldn't be used in logging since its stack trace is meaningless.
- */
- private static final SAXException STOP = new SAXException("max field length reached");
-
- /**
- * The extracted text content of the given binary value. Set to non-null
- * when the text extraction task finishes.
- */
- private volatile String extract = null;
-
- /**
- * Creates a new <code>LazyTextExtractorField</code> with the given
- * <code>name</code>.
- *
- * @param name the name of the field.
- * @param reader the reader where to obtain the string from.
- * @param highlighting set to <code>true</code> to enable result highlighting support
- */
- public LazyTextExtractorField(Parser parser, InternalValue value,
- Metadata metadata, Executor executor, boolean highlighting,
- int maxFieldLength) {
- super(FieldNames.FULLTEXT, highlighting ? Store.YES : Store.NO,
- Field.Index.ANALYZED, highlighting ? TermVector.WITH_OFFSETS
- : TermVector.NO);
- executor.execute(new ParsingTask(parser, value, metadata,
- maxFieldLength));
- }
-
- /**
- * Returns the extracted text. This method blocks until the text extraction
- * task has been completed.
- *
- * @return the string value of this field
- */
- public synchronized String stringValue() {
- try {
- while (!isExtractorFinished()) {
- wait();
- }
- return extract;
- } catch (InterruptedException e) {
- LOG.error("Text extraction thread was interrupted", e);
- return "";
- }
- }
-
- /**
- * @return always <code>null</code>
- */
- public Reader readerValue() {
- return null;
- }
-
- /**
- * @return always <code>null</code>
- */
- public byte[] binaryValue() {
- return null;
- }
-
- /**
- * @return always <code>null</code>
- */
- public TokenStream tokenStreamValue() {
- return null;
- }
-
- /**
- * Checks whether the text extraction task has finished.
- *
- * @return <code>true</code> if the extracted text is available
- */
- public boolean isExtractorFinished() {
- return extract != null;
- }
-
- private synchronized void setExtractedText(String value) {
- extract = value;
- notify();
- }
-
- /**
- * Releases all resources associated with this field.
- */
- public void dispose() {
- // TODO: Cause the ContentHandler below to throw an exception
- }
-
- /**
- * The background task for extracting text from a binary value.
- */
- private class ParsingTask extends DefaultHandler implements Runnable {
- private final Parser parser;
-
- private final InternalValue value;
-
- private final Metadata metadata;
-
- private final int maxFieldLength;
-
- private final StringBuilder builder = new StringBuilder();
-
- private final ParseContext context = new ParseContext();
-
- // NOTE: not a part of Jackrabbit code, made
- private final ContentHandler handler = new DefaultHandler();
-
- public ParsingTask(Parser parser, InternalValue value,
- Metadata metadata, int maxFieldLength) {
- this.parser = parser;
- this.value = value;
- this.metadata = metadata;
- this.maxFieldLength = maxFieldLength;
- }
-
- public void run() {
- try {
- try (InputStream stream = value.getStream()) {
- parser.parse(stream, handler, metadata, context);
- }
- } catch (LinkageError e) {
- // Capture and ignore
- } catch (Throwable t) {
- if (t != STOP) {
- LOG.debug("Failed to extract text.", t);
- setExtractedText("TextExtractionError");
- return;
- }
- } finally {
- value.discard();
- }
- setExtractedText(handler.toString());
-
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- builder.append(ch, start,
- Math.min(length, maxFieldLength - builder.length()));
- if (builder.length() >= maxFieldLength) {
- throw STOP;
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length)
- throws SAXException {
- characters(ch, start, length);
- }
- }
-}
diff --git a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java
index 2f7cd31..1885877 100755
--- a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java
+++ b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java
@@ -20,9 +20,8 @@ package org.apache.tika.example;
import java.io.File;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.tika.Tika;
@@ -38,8 +37,8 @@ public class LuceneIndexer {
public void indexDocument(File file) throws Exception {
Document document = new Document();
- document.add(new Field("filename", file.getName(), Store.YES, Index.ANALYZED));
- document.add(new Field("fulltext", tika.parseToString(file), Store.NO, Index.ANALYZED));
+ document.add(new TextField("filename", file.getName(), Store.YES));
+ document.add(new TextField("fulltext", tika.parseToString(file), Store.NO));
writer.addDocument(document);
}
}
diff --git a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java
index 2a7fd13..0692339 100755
--- a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java
+++ b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java
@@ -19,16 +19,15 @@ package org.apache.tika.example;
import java.io.File;
import java.io.Reader;
+import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.FSDirectory;
import org.apache.tika.Tika;
@SuppressWarnings("deprecation")
@@ -43,10 +42,10 @@ public class LuceneIndexerExtended {
}
public static void main(String[] args) throws Exception {
- try (IndexWriter writer = new IndexWriter(
- new SimpleFSDirectory(new File(args[0])),
- new StandardAnalyzer(Version.LUCENE_30),
- MaxFieldLength.UNLIMITED)) {
+ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
+ try (IndexWriter writer =
+ new IndexWriter(FSDirectory.open(Paths.get(args[0])),
+ indexWriterConfig)) {
LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
for (int i = 1; i < args.length; i++) {
indexer.indexDocument(new File(args[i]));
@@ -57,8 +56,8 @@ public class LuceneIndexerExtended {
public void indexDocument(File file) throws Exception {
try (Reader fulltext = tika.parse(file)) {
Document document = new Document();
- document.add(new Field("filename", file.getName(), Store.YES, Index.ANALYZED));
- document.add(new Field("fulltext", fulltext));
+ document.add(new TextField("filename", file.getName(), Store.YES));
+ document.add(new TextField("fulltext", fulltext));
writer.addDocument(document);
}
}
diff --git a/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java b/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java
index 5c6a9d4..dee4d13 100755
--- a/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java
+++ b/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java
@@ -23,9 +23,8 @@ import java.io.InputStream;
import java.util.Date;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.tika.Tika;
import org.apache.tika.metadata.DublinCore;
@@ -54,7 +53,7 @@ public class MetadataAwareLuceneIndexer {
for (String key : met.names()) {
String[] values = met.getValues(key);
for (String val : values) {
- document.add(new Field(key, val, Store.YES, Index.ANALYZED));
+ document.add(new TextField(key, val, Store.YES));
}
writer.addDocument(document);
}
@@ -79,7 +78,7 @@ public class MetadataAwareLuceneIndexer {
for (String key : met.names()) {
String[] values = met.getValues(key);
for (String val : values) {
- document.add(new Field(key, val, Store.YES, Index.ANALYZED));
+ document.add(new TextField(key, val, Store.YES));
}
writer.addDocument(document);
}
diff --git a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
index d6a259b..8fabd3a 100755
--- a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
+++ b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
@@ -17,8 +17,8 @@
package org.apache.tika.example;
-import java.io.File;
import java.io.IOException;
+import java.nio.file.Path;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.GregorianCalendar;
@@ -28,12 +28,14 @@ import java.util.TimeZone;
import org.apache.jackrabbit.util.ISO8601;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopScoreDocCollector;
-import org.apache.lucene.store.SimpleFSDirectory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.BytesRef;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
@@ -49,23 +51,24 @@ public class RecentFiles {
private SimpleDateFormat rssDateFormat = new SimpleDateFormat(
"E, dd MMM yyyy HH:mm:ss z", Locale.getDefault());
- public String generateRSS(File indexFile) throws CorruptIndexException,
+ public String generateRSS(Path indexFile) throws CorruptIndexException,
IOException {
StringBuffer output = new StringBuffer();
output.append(getRSSHeaders());
IndexSearcher searcher = null;
try {
- reader = IndexReader.open(new SimpleFSDirectory(indexFile));
+ reader = DirectoryReader.open(FSDirectory.open(indexFile));
searcher = new IndexSearcher(reader);
GregorianCalendar gc = new java.util.GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
gc.setTime(new Date());
String nowDateTime = ISO8601.format(gc);
gc.add(java.util.GregorianCalendar.MINUTE, -5);
String fiveMinsAgo = ISO8601.format(gc);
- TermRangeQuery query = new TermRangeQuery(Metadata.DATE.toString(),
- fiveMinsAgo, nowDateTime, true, true);
- TopScoreDocCollector collector = TopScoreDocCollector.create(20,
- true);
+ TermRangeQuery query = new TermRangeQuery(
+ Metadata.DATE.toString(),
+ new BytesRef(fiveMinsAgo), new BytesRef(nowDateTime),
+ true, true);
+ TopScoreDocCollector collector = TopScoreDocCollector.create(20);
searcher.search(query, collector);
ScoreDoc[] hits = collector.topDocs().scoreDocs;
for (int i = 0; i < hits.length; i++) {
@@ -75,7 +78,6 @@ public class RecentFiles {
} finally {
if (reader != null) reader.close();
- if (searcher != null) searcher.close();
}
output.append(getRSSFooters());