You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/11/28 06:44:44 UTC
svn commit: r1546288 - in /mahout/trunk/integration/src:
main/java/org/apache/mahout/utils/vectors/lucene/
test/java/org/apache/mahout/clustering/
test/java/org/apache/mahout/utils/vectors/lucene/
Author: smarthi
Date: Thu Nov 28 05:44:44 2013
New Revision: 1546288
URL: http://svn.apache.org/r1546288
Log:
MAHOUT-1343: More Lucene 3.x calls that need to be replaced by equivalent Lucene 4.x API
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java Thu Nov 28 05:44:44 2013
@@ -17,6 +17,7 @@
package org.apache.mahout.utils.vectors.lucene;
+import com.google.common.collect.Maps;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
@@ -27,7 +28,6 @@ import org.apache.mahout.utils.vectors.T
import java.io.IOException;
import java.util.Iterator;
-import java.util.LinkedHashMap;
import java.util.Map;
@@ -47,16 +47,15 @@ public class CachedTermInfo implements T
int numDocs = reader.numDocs();
double percent = numDocs * maxDfPercent / 100.0;
//Should we use a linked hash map so that we know terms are in order?
- termEntries = new LinkedHashMap<String, TermEntry>();
+ termEntries = Maps.newLinkedHashMap();
int count = 0;
BytesRef text;
while ((text = te.next()) != null) {
int df = te.docFreq();
- if (df < minDf || df > percent) {
- continue;
+ if (df >= minDf && df <= percent) {
+ TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
+ termEntries.put(entry.getTerm(), entry);
}
- TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
- termEntries.put(entry.getTerm(), entry);
}
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Thu Nov 28 05:44:44 2013
@@ -97,8 +97,7 @@ public final class TestClusterDumper ext
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory,
- new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(
- Version.LUCENE_43)));
+ new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43)));
try {
for (int i = 0; i < docs2.length; i++) {
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java Thu Nov 28 05:44:44 2013
@@ -25,6 +25,7 @@ import com.google.common.io.Closeables;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@@ -33,6 +34,7 @@ import org.apache.lucene.index.IndexWrit
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
import org.junit.Test;
public class CachedTermInfoTest extends MahoutTestCase {
@@ -57,11 +59,20 @@ public class CachedTermInfoTest extends
"e"
};
- @Override
- public void setUp() throws Exception {
- super.setUp();
+ @Before
+ public void before() throws IOException {
directory = new RAMDirectory();
- directory = createTestIndex(Field.TermVector.NO, directory, true, 0);
+
+ FieldType fieldType = new FieldType();
+ fieldType.setStored(false);
+ fieldType.setIndexed(true);
+ fieldType.setTokenized(true);
+ fieldType.setStoreTermVectors(false);
+ fieldType.setStoreTermVectorPositions(false);
+ fieldType.setStoreTermVectorOffsets(false);
+ fieldType.freeze();
+
+ directory = createTestIndex(fieldType, directory, 0);
}
@Test
@@ -86,9 +97,8 @@ public class CachedTermInfoTest extends
}
- static RAMDirectory createTestIndex(Field.TermVector termVector,
+ static RAMDirectory createTestIndex(FieldType fieldType,
RAMDirectory directory,
- boolean createNew,
int startingId) throws IOException {
IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43)));
@@ -97,11 +107,9 @@ public class CachedTermInfoTest extends
Document doc = new Document();
Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
doc.add(id);
- //Store both position and offset information
- //Says it is deprecated, but doesn't seem to offer an alternative that supports term vectors...
- Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+ Field text = new Field("content", DOCS[i], fieldType);
doc.add(text);
- Field text2 = new Field("content2", DOCS2[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+ Field text2 = new Field("content2", DOCS2[i], fieldType);
doc.add(text2);
writer.addDocument(doc);
}
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java Thu Nov 28 05:44:44 2013
@@ -25,6 +25,7 @@ import com.google.common.io.Closeables;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@@ -38,6 +39,7 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.vectorizer.TFIDF;
import org.apache.mahout.vectorizer.Weight;
+import org.junit.Before;
import org.junit.Test;
public final class LuceneIterableTest extends MahoutTestCase {
@@ -52,10 +54,29 @@ public final class LuceneIterableTest ex
private RAMDirectory directory;
- @Override
- public void setUp() throws Exception {
- super.setUp();
- directory = createTestIndex(Field.TermVector.YES);
+ private final FieldType TYPE_NO_TERM_VECTORS = new FieldType();
+
+ private final FieldType TYPE_TERM_VECTORS = new FieldType();
+
+ @Before
+ public void before() throws IOException {
+
+ TYPE_NO_TERM_VECTORS.setIndexed(true);
+ TYPE_NO_TERM_VECTORS.setTokenized(true);
+ TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
+ TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
+ TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
+ TYPE_NO_TERM_VECTORS.freeze();
+
+ TYPE_TERM_VECTORS.setIndexed(true);
+ TYPE_TERM_VECTORS.setTokenized(true);
+ TYPE_TERM_VECTORS.setStored(true);
+ TYPE_TERM_VECTORS.setStoreTermVectors(true);
+ TYPE_TERM_VECTORS.setStoreTermVectorPositions(true);
+ TYPE_TERM_VECTORS.setStoreTermVectorOffsets(true);
+ TYPE_TERM_VECTORS.freeze();
+
+ directory = createTestIndex(TYPE_TERM_VECTORS);
}
@Test
@@ -87,7 +108,7 @@ public final class LuceneIterableTest ex
@Test(expected = IllegalStateException.class)
public void testIterableNoTermVectors() throws IOException {
- RAMDirectory directory = createTestIndex(Field.TermVector.NO);
+ RAMDirectory directory = createTestIndex(TYPE_NO_TERM_VECTORS);
IndexReader reader = DirectoryReader.open(directory);
@@ -103,9 +124,9 @@ public final class LuceneIterableTest ex
@Test
public void testIterableSomeNoiseTermVectors() throws IOException {
//get noise vectors
- RAMDirectory directory = createTestIndex(Field.TermVector.YES, new RAMDirectory(), true, 0);
+ RAMDirectory directory = createTestIndex(TYPE_TERM_VECTORS, new RAMDirectory(), 0);
//get real vectors
- createTestIndex(Field.TermVector.NO, directory, false, 5);
+ createTestIndex(TYPE_NO_TERM_VECTORS, directory, 5);
IndexReader reader = DirectoryReader.open(directory);
Weight weight = new TFIDF();
@@ -113,7 +134,7 @@ public final class LuceneIterableTest ex
boolean exceptionThrown;
//0 percent tolerance
- LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight);
+ LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
try {
for (Object a : iterable) {
}
@@ -157,16 +178,15 @@ public final class LuceneIterableTest ex
assertTrue(exceptionThrown);
}
- static RAMDirectory createTestIndex(Field.TermVector termVector) throws IOException {
- return createTestIndex(termVector, new RAMDirectory(), true, 0);
+ static RAMDirectory createTestIndex(FieldType fieldType) throws IOException {
+ return createTestIndex(fieldType, new RAMDirectory(), 0);
}
- static RAMDirectory createTestIndex(Field.TermVector termVector,
+ static RAMDirectory createTestIndex(FieldType fieldType,
RAMDirectory directory,
- boolean createNew,
int startingId) throws IOException {
- IndexWriter writer = new IndexWriter( directory, new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(Version.LUCENE_43)));
-
+ IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(Version.LUCENE_43)));
+
try {
for (int i = 0; i < DOCS.length; i++) {
Document doc = new Document();
@@ -174,9 +194,9 @@ public final class LuceneIterableTest ex
doc.add(id);
//Store both position and offset information
//Says it is deprecated, but doesn't seem to offer an alternative that supports term vectors...
- Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+ Field text = new Field("content", DOCS[i], fieldType);
doc.add(text);
- Field text2 = new Field("content2", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+ Field text2 = new Field("content2", DOCS[i], fieldType);
doc.add(text2);
writer.addDocument(doc);
}