You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2016/08/11 05:44:22 UTC
mahout git commit: MAHOUT-1876: Upgrade lucene to 5.5.2 and fix
compilation failures, this closes apache/mahout#248
Repository: mahout
Updated Branches:
refs/heads/master 33c1eab11 -> 4d0cd66a6
MAHOUT-1876: Upgrade lucene to 5.5.2 and fix compilation failures, this closes apache/mahout#248
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/4d0cd66a
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/4d0cd66a
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/4d0cd66a
Branch: refs/heads/master
Commit: 4d0cd66a6269eb02fceaabdb11d70fd38d433474
Parents: 33c1eab
Author: smarthi <sm...@apache.org>
Authored: Thu Aug 11 01:42:30 2016 -0400
Committer: smarthi <sm...@apache.org>
Committed: Thu Aug 11 01:42:30 2016 -0400
----------------------------------------------------------------------
.../mahout/classifier/NewsgroupHelper.java | 3 +-
.../text/MailArchivesClusteringAnalyzer.java | 31 ++++++++------------
.../text/wikipedia/WikipediaAnalyzer.java | 17 +++++------
.../mahout/utils/regex/AnalyzerTransformer.java | 3 +-
.../vectors/lucene/AbstractLuceneIterator.java | 2 +-
.../utils/vectors/lucene/CachedTermInfo.java | 2 +-
.../utils/vectors/lucene/ClusterLabels.java | 19 ++++++------
.../mahout/utils/vectors/lucene/Driver.java | 3 +-
.../mahout/clustering/TestClusterDumper.java | 6 ++--
.../collocations/llr/BloomTokenFilterTest.java | 9 +++---
.../vectors/lucene/CachedTermInfoTest.java | 6 ++--
.../mahout/utils/vectors/lucene/DriverTest.java | 17 +++++------
.../vectors/lucene/LuceneIterableTest.java | 8 ++---
.../mahout/common/lucene/AnalyzerUtils.java | 4 +--
.../org/apache/mahout/vectorizer/TFIDF.java | 4 +--
.../encoders/LuceneTextValueEncoder.java | 10 ++-----
.../encoders/TextValueEncoderTest.java | 3 +-
pom.xml | 2 +-
18 files changed, 67 insertions(+), 82 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
index 3674a57..5cec51c 100644
--- a/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
+++ b/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
@@ -60,7 +59,7 @@ public final class NewsgroupHelper {
private static final long WEEK = 7 * 24 * 3600;
private final Random rand = RandomUtils.getRandom();
- private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
+ private final Analyzer analyzer = new StandardAnalyzer();
private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
index 8776c5f..12ed471 100644
--- a/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
@@ -16,12 +16,6 @@
*/
package org.apache.mahout.text;
-import java.io.IOException;
-import java.io.Reader;
-import java.util.Arrays;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -34,7 +28,11 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
-import org.apache.lucene.util.Version;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
/**
* Custom Lucene Analyzer designed for aggressive feature reduction
@@ -42,13 +40,11 @@ import org.apache.lucene.util.Version;
* stop words, excluding non-alpha-numeric tokens, and porter stemming.
*/
public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
- private static final Version LUCENE_VERSION = Version.LUCENE_46;
-
// extended set of stop words composed of common mail terms like "hi",
// HTML tags, and Java keywords as many of the messages in the archives
// are subversion check-in notifications
- private static final CharArraySet STOP_SET = new CharArraySet(LUCENE_VERSION, Arrays.asList(
+ private static final CharArraySet STOP_SET = new CharArraySet(Arrays.asList(
"3d","7bit","a0","about","above","abstract","across","additional","after",
"afterwards","again","against","align","all","almost","alone","along",
"already","also","although","always","am","among","amongst","amoungst",
@@ -108,22 +104,21 @@ public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
private static final Matcher MATCHER = ALPHA_NUMERIC.matcher("");
public MailArchivesClusteringAnalyzer() {
- super(LUCENE_VERSION, STOP_SET);
+ super(STOP_SET);
}
public MailArchivesClusteringAnalyzer(CharArraySet stopSet) {
- super(LUCENE_VERSION, stopSet);
-
+ super(stopSet);
}
@Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
- TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
- result = new LowerCaseFilter(LUCENE_VERSION, result);
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer();
+ TokenStream result = new StandardFilter(tokenizer);
+ result = new LowerCaseFilter(result);
result = new ASCIIFoldingFilter(result);
result = new AlphaNumericMaxLengthFilter(result);
- result = new StopFilter(LUCENE_VERSION, result, STOP_SET);
+ result = new StopFilter(result, STOP_SET);
result = new PorterStemFilter(result);
return new TokenStreamComponents(tokenizer, result);
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
index ad55ba7..d50323d 100644
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
+++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaAnalyzer.java
@@ -17,8 +17,6 @@
package org.apache.mahout.text.wikipedia;
-import java.io.Reader;
-
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
@@ -28,25 +26,24 @@ import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
-import org.apache.lucene.util.Version;
public class WikipediaAnalyzer extends StopwordAnalyzerBase {
public WikipediaAnalyzer() {
- super(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
public WikipediaAnalyzer(CharArraySet stopSet) {
- super(Version.LUCENE_46, stopSet);
+ super(stopSet);
}
@Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new WikipediaTokenizer(reader);
- TokenStream result = new StandardFilter(Version.LUCENE_46, tokenizer);
- result = new LowerCaseFilter(Version.LUCENE_46, result);
- result = new StopFilter(Version.LUCENE_46, result, getStopwordSet());
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new WikipediaTokenizer();
+ TokenStream result = new StandardFilter(tokenizer);
+ result = new LowerCaseFilter(result);
+ result = new StopFilter(result, getStopwordSet());
return new TokenStreamComponents(tokenizer, result);
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
index 36b166a..4585a0a 100644
--- a/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
+++ b/integration/src/main/java/org/apache/mahout/utils/regex/AnalyzerTransformer.java
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.lucene.TokenStreamIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -37,7 +36,7 @@ public class AnalyzerTransformer implements RegexTransformer {
private static final Logger log = LoggerFactory.getLogger(AnalyzerTransformer.class);
public AnalyzerTransformer() {
- this(new StandardAnalyzer(Version.LUCENE_46), "text");
+ this(new StandardAnalyzer(), "text");
}
public AnalyzerTransformer(Analyzer analyzer) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
index 233c95c..ff61a70 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
@@ -113,7 +113,7 @@ public abstract class AbstractLuceneIterator extends AbstractIterator<Vector> {
// The loop exits with termFreqVector and name set.
- TermsEnum te = termFreqVector.iterator(null);
+ TermsEnum te = termFreqVector.iterator();
BytesRef term;
TFDFMapper mapper = new TFDFMapper(indexReader.numDocs(), weight, this.terminfo);
mapper.setExpectations(field, termFreqVector.size());
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
index 718704a..0b59ed6 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
@@ -42,7 +42,7 @@ public class CachedTermInfo implements TermInfo {
public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
this.field = field;
Terms t = MultiFields.getTerms(reader, field);
- TermsEnum te = t.iterator(null);
+ TermsEnum te = t.iterator();
int numDocs = reader.numDocs();
double percent = numDocs * maxDfPercent / 100.0;
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
index 6ef7fba..b2568e7 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
@@ -21,6 +21,7 @@ import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.nio.file.Paths;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@@ -44,9 +45,9 @@ import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@@ -55,7 +56,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -149,7 +150,7 @@ public class ClusterLabels {
}
log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
- Directory dir = FSDirectory.open(new File(this.indexDir));
+ Directory dir = FSDirectory.open(Paths.get(this.indexDir));
IndexReader reader = DirectoryReader.open(dir);
@@ -165,7 +166,7 @@ public class ClusterLabels {
int numDocs = reader.numDocs();
- OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
+ FixedBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
log.info("Populating term infos from the index");
@@ -179,7 +180,7 @@ public class ClusterLabels {
* frequency.
*/
Terms t = MultiFields.getTerms(reader, contentField);
- TermsEnum te = t.iterator(null);
+ TermsEnum te = t.iterator();
Map<String, TermEntry> termEntryMap = new LinkedHashMap<>();
Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions
@@ -187,8 +188,8 @@ public class ClusterLabels {
int count = 0;
BytesRef term;
while ((term = te.next()) != null) {
- OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
- DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
+ FixedBitSet termBitset = new FixedBitSet(reader.maxDoc());
+ PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader, contentField, term);
int docID;
while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
//check to see if we don't have any deletions (null) or if the document is live
@@ -230,12 +231,12 @@ public class ClusterLabels {
return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
- private static OpenBitSet getClusterDocBitset(IndexReader reader,
+ private static FixedBitSet getClusterDocBitset(IndexReader reader,
Collection<String> idSet,
String idField) throws IOException {
int numDocs = reader.numDocs();
- OpenBitSet bitset = new OpenBitSet(numDocs);
+ FixedBitSet bitset = new FixedBitSet(numDocs);
Set<String> idFieldSelector = null;
if (idField != null) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
index 2eeebd9..876816f 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
@@ -20,6 +20,7 @@ package org.apache.mahout.utils.vectors.lucene;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
+import java.nio.file.Paths;
import java.util.Iterator;
import com.google.common.base.Preconditions;
@@ -85,7 +86,7 @@ public final class Driver {
Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 99");
- Directory dir = FSDirectory.open(file);
+ Directory dir = FSDirectory.open(Paths.get(file.getAbsolutePath()));
IndexReader reader = DirectoryReader.open(dir);
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
index a1d2bbb..01d46fc 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -31,11 +31,11 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
@@ -94,7 +94,7 @@ public final class TestClusterDumper extends MahoutTestCase {
sampleData = new ArrayList<>();
RAMDirectory directory = new RAMDirectory();
try (IndexWriter writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)))){
+ new IndexWriterConfig(new StandardAnalyzer()))){
for (int i = 0; i < docs2.length; i++) {
Document doc = new Document();
Field id = new StringField("id", "doc_" + i, Field.Store.YES);
@@ -102,7 +102,7 @@ public final class TestClusterDumper extends MahoutTestCase {
// Store both position and offset information
FieldType fieldType = new FieldType();
fieldType.setStored(false);
- fieldType.setIndexed(true);
+ fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fieldType.setTokenized(true);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
index 37efc01..4fdbbbc 100644
--- a/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
@@ -36,7 +36,6 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.junit.Test;
@@ -79,7 +78,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
@Test
public void testAnalyzer() throws IOException {
Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+ Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream(null, reader);
ts.reset();
validateTokens(allTokens, ts);
@@ -91,7 +90,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
@Test
public void testNonKeepdAnalyzer() throws IOException {
Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+ Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream(null, reader);
ts.reset();
TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
@@ -104,7 +103,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
@Test
public void testKeepAnalyzer() throws IOException {
Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+ Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream(null, reader);
ts.reset();
TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
@@ -117,7 +116,7 @@ public final class BloomTokenFilterTest extends MahoutTestCase {
@Test
public void testShingleFilteredAnalyzer() throws IOException {
Reader reader = new StringReader(input);
- Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
+ Analyzer analyzer = new WhitespaceAnalyzer();
TokenStream ts = analyzer.tokenStream(null, reader);
ts.reset();
ShingleFilter sf = new ShingleFilter(ts, 3);
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
index 44a91e9..890a14b 100644
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
@@ -28,11 +28,11 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.junit.Before;
import org.junit.Test;
@@ -65,7 +65,7 @@ public class CachedTermInfoTest extends MahoutTestCase {
FieldType fieldType = new FieldType();
fieldType.setStored(false);
- fieldType.setIndexed(true);
+ fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fieldType.setTokenized(true);
fieldType.setStoreTermVectors(false);
fieldType.setStoreTermVectorPositions(false);
@@ -100,7 +100,7 @@ public class CachedTermInfoTest extends MahoutTestCase {
static RAMDirectory createTestIndex(FieldType fieldType,
RAMDirectory directory,
int startingId) throws IOException {
- IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46, new WhitespaceAnalyzer(Version.LUCENE_46)));
+ IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new WhitespaceAnalyzer()));
try {
for (int i = 0; i < DOCS.length; i++) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
index 6ac2df8..86c8305 100644
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/DriverTest.java
@@ -30,18 +30,18 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
-import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Paths;
import java.util.Set;
public class DriverTest extends MahoutTestCase {
@@ -73,9 +73,8 @@ public class DriverTest extends MahoutTestCase {
public static final FieldType TYPE = new FieldType();
static {
- TYPE.setIndexed(true);
TYPE.setOmitNorms(true);
- TYPE.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS);
+ TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
TYPE.setStored(true);
TYPE.setTokenized(true);
TYPE.setStoreTermVectors(true);
@@ -90,9 +89,10 @@ public class DriverTest extends MahoutTestCase {
@Test
public void sequenceFileDictionary() throws IOException {
- Directory index = new SimpleFSDirectory(indexDir);
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
+ Directory index = new SimpleFSDirectory(Paths.get(indexDir.getAbsolutePath()));
+ Analyzer analyzer = new StandardAnalyzer();
+ IndexWriterConfig config = new IndexWriterConfig(analyzer);
+ config.setCommitOnClose(true);
final IndexWriter writer = new IndexWriter(index, config);
try {
@@ -100,9 +100,8 @@ public class DriverTest extends MahoutTestCase {
writer.addDocument(asDocument("One Ring to find them,"));
writer.addDocument(asDocument("One Ring to bring them all"));
writer.addDocument(asDocument("and in the darkness bind them"));
-
} finally {
- writer.close(true);
+ writer.close();
}
File seqDict = new File(outputDir, "dict.seq");
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
index ba49a2d..8d92551 100644
--- a/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
+++ b/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
@@ -29,11 +29,11 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
@@ -62,14 +62,14 @@ public final class LuceneIterableTest extends MahoutTestCase {
@Before
public void before() throws IOException {
- TYPE_NO_TERM_VECTORS.setIndexed(true);
+ TYPE_NO_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_NO_TERM_VECTORS.setTokenized(true);
TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
TYPE_NO_TERM_VECTORS.freeze();
- TYPE_TERM_VECTORS.setIndexed(true);
+ TYPE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_TERM_VECTORS.setTokenized(true);
TYPE_TERM_VECTORS.setStored(true);
TYPE_TERM_VECTORS.setStoreTermVectors(true);
@@ -177,7 +177,7 @@ public final class LuceneIterableTest extends MahoutTestCase {
RAMDirectory directory,
int startingId) throws IOException {
- try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_46,new StandardAnalyzer(Version.LUCENE_46)))) {
+ try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
for (int i = 0; i < DOCS.length; i++) {
Document doc = new Document();
Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
index 37ca383..742d6cf 100644
--- a/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
+++ b/mr/src/main/java/org/apache/mahout/common/lucene/AnalyzerUtils.java
@@ -32,7 +32,7 @@ public final class AnalyzerUtils {
* @throws ClassNotFoundException - {@link ClassNotFoundException}
*/
public static Analyzer createAnalyzer(String analyzerClassName) throws ClassNotFoundException {
- return createAnalyzer(analyzerClassName, Version.LUCENE_46);
+ return createAnalyzer(analyzerClassName, Version.LUCENE_5_5_2);
}
public static Analyzer createAnalyzer(String analyzerClassName, Version version) throws ClassNotFoundException {
@@ -47,7 +47,7 @@ public final class AnalyzerUtils {
* @return {@link Analyzer}
*/
public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass) {
- return createAnalyzer(analyzerClass, Version.LUCENE_46);
+ return createAnalyzer(analyzerClass, Version.LUCENE_5_5_2);
}
public static Analyzer createAnalyzer(Class<? extends Analyzer> analyzerClass, Version version) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java b/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
index 0a537eb..238fa03 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/TFIDF.java
@@ -17,11 +17,11 @@
package org.apache.mahout.vectorizer;
-import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
//TODO: add a new class that supports arbitrary Lucene similarity implementations
public class TFIDF implements Weight {
- private final DefaultSimilarity sim = new DefaultSimilarity();
+ private final ClassicSimilarity sim = new ClassicSimilarity();
@Override
public double calculate(int tf, int df, int length, int numDocs) {
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
----------------------------------------------------------------------
diff --git a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
index 3bae26e..e3e133c 100644
--- a/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
+++ b/mr/src/main/java/org/apache/mahout/vectorizer/encoders/LuceneTextValueEncoder.java
@@ -49,13 +49,9 @@ public class LuceneTextValueEncoder extends TextValueEncoder {
*/
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
- try {
- TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
- ts.addAttribute(CharTermAttribute.class);
- return new LuceneTokenIterable(ts, false);
- } catch (IOException ex) {
- throw new IllegalStateException(ex);
- }
+ TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
+ ts.addAttribute(CharTermAttribute.class);
+ return new LuceneTokenIterable(ts, false);
}
private static final class CharSequenceReader extends Reader {
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
----------------------------------------------------------------------
diff --git a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
index 4446fef..be3e03e 100644
--- a/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
+++ b/mr/src/test/java/org/apache/mahout/vectorizer/encoders/TextValueEncoderTest.java
@@ -19,7 +19,6 @@ package org.apache.mahout.vectorizer.encoders;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
@@ -70,7 +69,7 @@ public final class TextValueEncoderTest extends MahoutTestCase {
@Test
public void testLuceneEncoding() throws Exception {
LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
- enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46));
+ enc.setAnalyzer(new WhitespaceAnalyzer());
Vector v1 = new DenseVector(200);
enc.addToVector("test1 and more", v1);
enc.flush(1, v1);
http://git-wip-us.apache.org/repos/asf/mahout/blob/4d0cd66a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index ca0ea21..165e42e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -117,7 +117,7 @@
<mjavadoc.version>2.10.3</mjavadoc.version>
<mscala.version>3.2.0</mscala.version>
<hbase.version>1.0.0</hbase.version>
- <lucene.version>4.6.1</lucene.version>
+ <lucene.version>5.5.2</lucene.version>
<slf4j.version>1.7.19</slf4j.version>
<scala.compat.version>2.10</scala.compat.version>
<scala.version>2.10.4</scala.version>