You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/11/28 06:44:44 UTC

svn commit: r1546288 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/clustering/ test/java/org/apache/mahout/utils/vectors/lucene/

Author: smarthi
Date: Thu Nov 28 05:44:44 2013
New Revision: 1546288

URL: http://svn.apache.org/r1546288
Log:
MAHOUT-1343: More Lucene 3.x calls that need to be replaced by equivalent Lucene 4.x API

Modified:
    mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java Thu Nov 28 05:44:44 2013
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.utils.vectors.lucene;
 
+import com.google.common.collect.Maps;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Terms;
@@ -27,7 +28,6 @@ import org.apache.mahout.utils.vectors.T
 
 import java.io.IOException;
 import java.util.Iterator;
-import java.util.LinkedHashMap;
 import java.util.Map;
 
 
@@ -47,16 +47,15 @@ public class CachedTermInfo implements T
     int numDocs = reader.numDocs();
     double percent = numDocs * maxDfPercent / 100.0;
     //Should we use a linked hash map so that we know terms are in order?
-    termEntries = new LinkedHashMap<String, TermEntry>();
+    termEntries = Maps.newLinkedHashMap();
     int count = 0;
     BytesRef text;
     while ((text = te.next()) != null) {
       int df = te.docFreq();
-      if (df < minDf || df > percent) {
-        continue;
+      if (df >= minDf && df <= percent) {
+        TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
+        termEntries.put(entry.getTerm(), entry);
       }
-      TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
-      termEntries.put(entry.getTerm(), entry);
     }
   }
 

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Thu Nov 28 05:44:44 2013
@@ -97,8 +97,7 @@ public final class TestClusterDumper ext
     RAMDirectory directory = new RAMDirectory();
     
     IndexWriter writer = new IndexWriter(directory, 
-           new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(
-        Version.LUCENE_43)));
+           new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43)));
             
     try {
       for (int i = 0; i < docs2.length; i++) {

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java Thu Nov 28 05:44:44 2013
@@ -25,6 +25,7 @@ import com.google.common.io.Closeables;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -33,6 +34,7 @@ import org.apache.lucene.index.IndexWrit
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
 import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
 import org.junit.Test;
 
 public class CachedTermInfoTest extends MahoutTestCase {
@@ -57,11 +59,20 @@ public class CachedTermInfoTest extends 
           "e"
   };
 
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
+  @Before
+  public void before() throws IOException {
     directory = new RAMDirectory();
-    directory = createTestIndex(Field.TermVector.NO, directory, true, 0);
+
+    FieldType fieldType = new FieldType();
+    fieldType.setStored(false);
+    fieldType.setIndexed(true);
+    fieldType.setTokenized(true);
+    fieldType.setStoreTermVectors(false);
+    fieldType.setStoreTermVectorPositions(false);
+    fieldType.setStoreTermVectorOffsets(false);
+    fieldType.freeze();
+
+    directory = createTestIndex(fieldType, directory, 0);
   }
 
   @Test
@@ -86,9 +97,8 @@ public class CachedTermInfoTest extends 
 
   }
 
-  static RAMDirectory createTestIndex(Field.TermVector termVector,
+  static RAMDirectory createTestIndex(FieldType fieldType,
                                       RAMDirectory directory,
-                                      boolean createNew,
                                       int startingId) throws IOException {
     IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43)));
 
@@ -97,11 +107,9 @@ public class CachedTermInfoTest extends 
         Document doc = new Document();
         Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES);
         doc.add(id);
-        //Store both position and offset information
-        //Says it is deprecated, but doesn't seem to offer an alternative that supports term vectors...
-        Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+        Field text = new Field("content", DOCS[i], fieldType);
         doc.add(text);
-        Field text2 = new Field("content2", DOCS2[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+        Field text2 = new Field("content2", DOCS2[i], fieldType);
         doc.add(text2);
         writer.addDocument(doc);
       }

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1546288&r1=1546287&r2=1546288&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java Thu Nov 28 05:44:44 2013
@@ -25,6 +25,7 @@ import com.google.common.io.Closeables;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -38,6 +39,7 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.utils.vectors.TermInfo;
 import org.apache.mahout.vectorizer.TFIDF;
 import org.apache.mahout.vectorizer.Weight;
+import org.junit.Before;
 import org.junit.Test;
 
 public final class LuceneIterableTest extends MahoutTestCase {
@@ -52,10 +54,29 @@ public final class LuceneIterableTest ex
 
   private RAMDirectory directory;
 
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    directory = createTestIndex(Field.TermVector.YES);
+  private final FieldType TYPE_NO_TERM_VECTORS = new FieldType();
+
+  private final FieldType TYPE_TERM_VECTORS = new FieldType();
+
+  @Before
+  public void before() throws IOException {
+
+    TYPE_NO_TERM_VECTORS.setIndexed(true);
+    TYPE_NO_TERM_VECTORS.setTokenized(true);
+    TYPE_NO_TERM_VECTORS.setStoreTermVectors(false);
+    TYPE_NO_TERM_VECTORS.setStoreTermVectorPositions(false);
+    TYPE_NO_TERM_VECTORS.setStoreTermVectorOffsets(false);
+    TYPE_NO_TERM_VECTORS.freeze();
+
+    TYPE_TERM_VECTORS.setIndexed(true);
+    TYPE_TERM_VECTORS.setTokenized(true);
+    TYPE_TERM_VECTORS.setStored(true);
+    TYPE_TERM_VECTORS.setStoreTermVectors(true);
+    TYPE_TERM_VECTORS.setStoreTermVectorPositions(true);
+    TYPE_TERM_VECTORS.setStoreTermVectorOffsets(true);
+    TYPE_TERM_VECTORS.freeze();
+
+    directory = createTestIndex(TYPE_TERM_VECTORS);
   }
 
   @Test
@@ -87,7 +108,7 @@ public final class LuceneIterableTest ex
 
   @Test(expected = IllegalStateException.class)
   public void testIterableNoTermVectors() throws IOException {
-    RAMDirectory directory = createTestIndex(Field.TermVector.NO);
+    RAMDirectory directory = createTestIndex(TYPE_NO_TERM_VECTORS);
     IndexReader reader = DirectoryReader.open(directory);
     
     
@@ -103,9 +124,9 @@ public final class LuceneIterableTest ex
   @Test
   public void testIterableSomeNoiseTermVectors() throws IOException {
     //get noise vectors
-    RAMDirectory directory = createTestIndex(Field.TermVector.YES, new RAMDirectory(), true, 0);
+    RAMDirectory directory = createTestIndex(TYPE_TERM_VECTORS, new RAMDirectory(), 0);
     //get real vectors
-    createTestIndex(Field.TermVector.NO, directory, false, 5);
+    createTestIndex(TYPE_NO_TERM_VECTORS, directory, 5);
     IndexReader reader = DirectoryReader.open(directory);
 
     Weight weight = new TFIDF();
@@ -113,7 +134,7 @@ public final class LuceneIterableTest ex
     
     boolean exceptionThrown;
     //0 percent tolerance
-    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight);
+    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);
     try {
       for (Object a : iterable) {
       }
@@ -157,16 +178,15 @@ public final class LuceneIterableTest ex
     assertTrue(exceptionThrown);
   }
   
-  static RAMDirectory createTestIndex(Field.TermVector termVector) throws IOException {
-      return createTestIndex(termVector, new RAMDirectory(), true, 0);
+  static RAMDirectory createTestIndex(FieldType fieldType) throws IOException {
+      return createTestIndex(fieldType, new RAMDirectory(), 0);
   }
   
-  static RAMDirectory createTestIndex(Field.TermVector termVector,
+  static RAMDirectory createTestIndex(FieldType fieldType,
                                               RAMDirectory directory,
-                                              boolean createNew,
                                               int startingId) throws IOException {
-    IndexWriter writer = new IndexWriter( directory, new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(Version.LUCENE_43)));
-        
+    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_43,new StandardAnalyzer(Version.LUCENE_43)));
+
     try {
       for (int i = 0; i < DOCS.length; i++) {
         Document doc = new Document();
@@ -174,9 +194,9 @@ public final class LuceneIterableTest ex
         doc.add(id);
         //Store both position and offset information
         //Says it is deprecated, but doesn't seem to offer an alternative that supports term vectors...
-        Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+        Field text = new Field("content", DOCS[i], fieldType);
         doc.add(text);
-        Field text2 = new Field("content2", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector);
+        Field text2 = new Field("content2", DOCS[i], fieldType);
         doc.add(text2);
         writer.addDocument(doc);
       }