You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/17 16:22:14 UTC

svn commit: r785618 - in /lucene/mahout/trunk: ./ examples/ maven/ utils/ utils/src/ utils/src/main/ utils/src/main/java/ utils/src/main/java/org/ utils/src/main/java/org/apache/ utils/src/main/java/org/apache/mahout/ utils/src/main/java/org/apache/mah...

Author: gsingers
Date: Wed Jun 17 14:22:08 2009
New Revision: 785618

URL: http://svn.apache.org/viewvc?rev=785618&view=rev
Log:
MAHOUT-126: Commit first iteration of this patch

Added:
    lucene/mahout/trunk/utils/   (with props)
    lucene/mahout/trunk/utils/pom.xml
    lucene/mahout/trunk/utils/src/
    lucene/mahout/trunk/utils/src/main/
    lucene/mahout/trunk/utils/src/main/java/
    lucene/mahout/trunk/utils/src/main/java/org/
    lucene/mahout/trunk/utils/src/main/java/org/apache/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java
    lucene/mahout/trunk/utils/src/main/resources/
    lucene/mahout/trunk/utils/src/test/
    lucene/mahout/trunk/utils/src/test/java/
    lucene/mahout/trunk/utils/src/test/java/org/
    lucene/mahout/trunk/utils/src/test/java/org/apache/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
    lucene/mahout/trunk/utils/src/test/resources/
Modified:
    lucene/mahout/trunk/examples/build-deprecated.xml
    lucene/mahout/trunk/examples/pom.xml
    lucene/mahout/trunk/maven/build.xml
    lucene/mahout/trunk/pom.xml

Modified: lucene/mahout/trunk/examples/build-deprecated.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/build-deprecated.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/build-deprecated.xml (original)
+++ lucene/mahout/trunk/examples/build-deprecated.xml Wed Jun 17 14:22:08 2009
@@ -262,59 +262,4 @@
   <!-- EXAMPLES -->
 
 
-
-  <property name="working.dir" value="work"/>
-  <target name="check-files">
-    <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
-    <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
-    <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
-      <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
-      
-  </target>
-
-  <target name="enwiki-files" depends="check-files">
-        <mkdir dir="temp"/>
-        <antcall target="get-enwiki"/>
-        <antcall target="expand-enwiki"/>
-    </target>
-
-    <target name="get-enwiki" unless="enwiki.exists">
-        <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
-             dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
-    </target>
-
-    <target name="expand-enwiki"  unless="enwiki.expanded">
-        <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
-    </target>
-
-
-  <target name="get-20news-18828" unless="20news-18828.exists">
-    <get src="http://people.csail.mit.edu/jrennie/20Newsgroups/20news-18828.tar.gz"
-         dest="temp/20news-18828.tar.gz"/>
-
-  </target>
-  <target name="expand-20news-18828" unless="20news-18828.expanded">
-    <gunzip src="temp/20news-18828.tar.gz" dest="temp"/>
-    <untar src="temp/20news-18828.tar" dest="${working.dir}"/>
-  </target>
-
-  <target name="extract-20news-18828" depends="check-files, compile" unless="reuters.extracted">
-    <mkdir dir="${working.dir}/20news-18828-collapse"/>
-    <java classname="org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups" maxmemory="1024M" fork="true">
-      <classpath refid="test.classpath"/>
-      <!--
-      Input format is:
-      inputDir outputDir label Analyzer character set
-      -->
-      <arg line="-p ${working.dir}/20news-18828/ -o ${working.dir}/20news-18828-collapse -a org.apache.lucene.analysis.standard.StandardAnalyzer -c UTF-8"/>
-    </java>
-  </target>
-
-  <target name="get-files" depends="check-files"  description="Get and extract the 20 Newsgroups data">
-    <mkdir dir="temp"/>
-    <antcall target="get-20news-18828"/>
-    <antcall target="expand-20news-18828"/>
-    <!--<antcall target="extract-20news-18828"/>-->
-  </target>
-
 </project>

Modified: lucene/mahout/trunk/examples/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/pom.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/pom.xml (original)
+++ lucene/mahout/trunk/examples/pom.xml Wed Jun 17 14:22:08 2009
@@ -102,7 +102,44 @@
               <goal>run</goal>
             </goals>
           </execution>
+          <execution>
+            <id>get-20news</id>
+            <phase>process-classes</phase>
+            <configuration>
+              <tasks if="get.20news">
+                <ant antfile="../maven/build.xml" target="get-files">
+                  <property name="dest" value="${project.build.directory}" />
+                  <property name="fullnamever" value="${project.artifactId}-${project.version}" />
+                  <property name="core-lib" value="../core/lib" />
+                  <property name="shared-lib" value="../lib" />
+                  <property name="version" value="${project.version}" />
+                </ant>
+              </tasks>
 
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>get-enwiki</id>
+            <phase>process-classes</phase>
+            <configuration>
+              <tasks if="get.enwiki">
+                <ant antfile="../maven/build.xml" target="enwiki-files">
+                  <property name="dest" value="${project.build.directory}" />
+                  <property name="fullnamever" value="${project.artifactId}-${project.version}" />
+                  <property name="core-lib" value="../core/lib" />
+                  <property name="shared-lib" value="../lib" />
+                  <property name="version" value="${project.version}" />
+                </ant>
+              </tasks>
+
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
         </executions>
       </plugin>
 

Modified: lucene/mahout/trunk/maven/build.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/maven/build.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/maven/build.xml (original)
+++ lucene/mahout/trunk/maven/build.xml Wed Jun 17 14:22:08 2009
@@ -64,5 +64,66 @@
       </fileset>
     </jar>
   </target>
+  <!-- Examples -->
+  <property name="working.dir" value="work"/>
+  <target name="check-files">
+    <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
+    <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
+    <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
+      <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
+
+  </target>
+
+  <target name="enwiki-files" depends="check-files">
+        <mkdir dir="temp"/>
+        <antcall target="get-enwiki"/>
+        <antcall target="expand-enwiki"/>
+    </target>
+
+    <target name="get-enwiki" unless="enwiki.exists">
+        <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
+             dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
+    </target>
+
+    <target name="expand-enwiki"  unless="enwiki.expanded">
+        <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
+    </target>
+
+
+  <target name="get-20news-18828" unless="20news-18828.exists">
+    <get src="http://people.csail.mit.edu/jrennie/20Newsgroups/20news-18828.tar.gz"
+         dest="temp/20news-18828.tar.gz"/>
+
+  </target>
+  <target name="expand-20news-18828" unless="20news-18828.expanded">
+    <gunzip src="temp/20news-18828.tar.gz" dest="temp"/>
+    <untar src="temp/20news-18828.tar" dest="${working.dir}"/>
+  </target>
+
+  <target  name="get-20news" depends="check-files">
+    <mkdir dir="temp"/><!-- the download goes to temp/, which only get-files created before -->
+    <antcall target="get-20news-18828"/>
+    <antcall target="expand-20news-18828"/>
+  </target>
+  <target name="extract-20news-18828" depends="check-files" unless="reuters.extracted">
+    <mkdir dir="${working.dir}/20news-18828-collapse"/>
+    <java classname="org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups" maxmemory="1024M" fork="true">
+      <classpath refid="maven.test.classpath"/>
+      <!--
+      Input format is:
+      inputDir outputDir label Analyzer character set
+      -->
+      <arg line="-p ${working.dir}/20news-18828/ -o ${working.dir}/20news-18828-collapse -a org.apache.lucene.analysis.standard.StandardAnalyzer -c UTF-8"/>
+    </java>
+  </target>
+
+  <target name="get-files" depends="check-files"  description="Get and extract the 20 Newsgroups and English Wikipedia (enwiki) data">
+    <mkdir dir="temp"/>
+    <antcall target="get-20news"/>
+    <antcall target="enwiki-files"/>
+
+    <!--<antcall target="extract-20news-18828"/>-->
+  </target>
+
 
 </project>

Modified: lucene/mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/pom.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/pom.xml (original)
+++ lucene/mahout/trunk/pom.xml Wed Jun 17 14:22:08 2009
@@ -23,6 +23,7 @@
     <module>core</module>
     <module>taste-web</module>
     <module>examples</module>
+    <module>utils</module>
   </modules>
 
   <build>

Propchange: lucene/mahout/trunk/utils/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Jun 17 14:22:08 2009
@@ -0,0 +1,2 @@
+*.iml
+target

Added: lucene/mahout/trunk/utils/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/pom.xml?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/pom.xml (added)
+++ lucene/mahout/trunk/utils/pom.xml Wed Jun 17 14:22:08 2009
@@ -0,0 +1,155 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.mahout</groupId>
+    <artifactId>mahout-parent</artifactId>
+    <version>1.0</version>
+    <relativePath>../maven</relativePath>
+  </parent>
+
+  <groupId>org.apache.mahout</groupId>
+  <artifactId>mahout-utils</artifactId>
+  <version>0.2-SNAPSHOT</version>
+  <name>Mahout utilities</name>
+  <description>Utilities for preparing content into formats for Mahout.</description>
+
+  <packaging>jar</packaging>
+
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-resources-plugin</artifactId>
+        <version>2.3</version>
+        <configuration>
+          <encoding>UTF-8</encoding>
+        </configuration>
+        <executions>
+          <execution>
+            <id>copy-resources</id>
+            <phase>process-resources</phase>
+            <goals>
+              <goal>copy-resources</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>
+                ${project.build.directory}/classes/META-INF
+              </outputDirectory>
+              <resources>
+                <resource>
+                  <directory>..</directory>
+                  <includes>
+                    <include>README.txt</include>
+                    <include>NOTICE.txt</include>
+                    <include>LICENSE.txt</include>
+                  </includes>
+                </resource>
+              </resources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <encoding>UTF-8</encoding>
+          <source>1.6</source>
+          <target>1.6</target>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>copy-dependencies</id>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-dependencies</goal>
+            </goals>
+            <configuration>
+              <!-- configure the plugin here -->
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+
+  </build>
+
+  <dependencies>
+
+    <dependency>
+      <groupId>org.apache.mahout</groupId>
+      <artifactId>mahout-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-solrj</artifactId>
+      <version>1.4-SNAPSHOT</version>
+    </dependency>
+    <!-- core test -->
+    <!--<dependency>
+      <groupId>org.apache.mahout</groupId>
+      <artifactId>mahout-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>-->
+
+    <dependency>
+      <groupId>org.easymock</groupId>
+      <artifactId>easymock</artifactId>
+      <version>2.4</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.easymock</groupId>
+      <artifactId>easymockclassextension</artifactId>
+      <version>2.4</version>
+      <scope>test</scope>
+    </dependency>
+
+    <!--  cglib contains nested dependencies that interfere with easymock,
+          thus the cglib references needs to be below easymock  -->
+    <dependency>
+      <groupId>cglib</groupId>
+      <artifactId>cglib</artifactId>
+      <version>2.1_3</version>
+    </dependency>
+
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>3.8.2</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>1.5.6</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-jcl</artifactId>
+      <version>1.5.6</version>
+    </dependency>
+
+  </dependencies>
+
+
+
+  <scm>
+    <connection>scm:svn:https://svn.apache.org/repos/asf/lucene/mahout/trunk/utils</connection>
+    <developerConnection>scm:svn:https://svn.apache.org/repos/asf/lucene/mahout/trunk/utils</developerConnection>
+    <url>https://svn.apache.org/repos/asf/lucene/mahout/trunk/utils</url>
+  </scm>
+</project>

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,207 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
+import org.apache.mahout.utils.vectors.lucene.LuceneIteratable;
+import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
+import org.apache.mahout.utils.vectors.lucene.VectorMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+
+
+/**
+ *
+ *
+ **/
+public class Driver {
+  private transient static Logger log = LoggerFactory.getLogger(Driver.class);
+  //TODO: This assumes LuceneIterable, make it generic.
+  
+  public static void main(String[] args) throws IOException {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
+            abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Lucene directory").withShortName("d").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+            withDescription("The output file").withShortName("o").create();
+
+    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
+            abuilder.withName("field").withMinimum(1).withMaximum(1).create()).
+            withDescription("The field in the index").withShortName("f").create();
+
+    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
+            abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
+            withDescription("The field in the index containing the index.  If null, then the Lucene internal doc " +
+                    "id is used which is prone to error if the underlying index changes").withShortName("i").create();
+    
+    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
+            abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
+            withDescription("The output of the dictionary").withShortName("t").create();
+
+    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
+            abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
+            withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
+    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
+            abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).
+            withDescription("The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  " +
+                    "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
+    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
+            abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
+            withDescription("The maximum number of vectors to output.  If not specified, then it will loop over all docs").withShortName("m").create();
+    Option helpOpt = obuilder.withLongName("help").
+            withDescription("Print out help").withShortName("h").create();
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
+            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).create();
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+
+        printHelp(group);
+        return;
+      }
+      //Springify all this
+      if (cmdLine.hasOption(inputOpt)) {//Lucene case
+        File file = new File(cmdLine.getValue(inputOpt).toString());
+        if (file.exists() && file.isDirectory()) {
+          int maxDocs = Integer.MAX_VALUE;
+          if (cmdLine.hasOption(maxOpt)) {
+            maxDocs = Integer.parseInt(cmdLine.getValue(maxOpt).toString());
+          }
+          if (maxDocs < 0) {
+            throw new IllegalArgumentException("maxDocs must be >= 0");
+          }
+          Directory dir = FSDirectory.open(file);
+          IndexReader reader = IndexReader.open(dir, true);
+          Weight weight = new TFIDF();
+          String field = cmdLine.getValue(fieldOpt).toString();
+          TermInfo termInfo = new CachedTermInfo(reader, field, 1, 99);
+          VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+          LuceneIteratable iteratable = null;
+          String power = null;
+          double norm = -1;
+          if (cmdLine.hasOption(powerOpt)) {
+            power = cmdLine.getValue(powerOpt).toString();
+            if (power.equals("INF")) {
+              norm = Double.POSITIVE_INFINITY;
+            } else {
+              norm = Double.parseDouble(power);
+            }
+          }
+          String idField = null;
+          if (cmdLine.hasOption(idFieldOpt)){
+            idField = cmdLine.getValue(idFieldOpt).toString();
+          }
+          if (norm == -1) {
+            iteratable = new LuceneIteratable(reader, idField, field, mapper);
+          } else {
+            iteratable = new LuceneIteratable(reader, idField, field, mapper, norm);
+          }
+          File outFile = new File(cmdLine.getValue(outputOpt).toString());
+          log.info("Output File: " + outFile);
+          BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+          int i = 0;
+          for (Vector vector : iteratable) {
+            if (i >= maxDocs){
+              break;
+            }
+            writer.write(vector.asFormatString());
+            writer.write("\n");
+            if (i % 500 == 0) {
+              log.info("i = " + i);
+            }
+            i++;
+          }
+          log.info("Wrote " + i + " vectors");
+          writer.flush();
+          writer.close();
+          // TODO: replace with aa codec
+          File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
+          log.info("Dictionary Output file: " + dictOutFile);
+          writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+          Iterator<TermEntry> entIter = termInfo.getAllEntries();
+          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+          writer.write("input");
+          writer.write(delimiter);
+          writer.write(file.getAbsolutePath());
+          writer.write("\n");
+          writer.write("field");
+          writer.write(delimiter);
+          writer.write(field);
+          writer.write("\n");
+          writer.write("num.terms");
+          writer.write(delimiter);
+          writer.write(String.valueOf(termInfo.totalTerms(field)));
+          writer.write("\n");
+          writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
+          writer.write("\n");
+          while (entIter.hasNext()) {
+            TermEntry entry = entIter.next();
+            writer.write(entry.term);
+            writer.write(delimiter);
+            writer.write(String.valueOf(entry.docFreq));
+            writer.write(delimiter);
+            writer.write(String.valueOf(entry.termIdx));
+            writer.write("\n");
+          }
+          writer.flush();
+          writer.close();
+        }
+      }
+
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      printHelp(group);
+    }
+  }
+
+  private static void printHelp(Group group) {
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.setGroup(group);
+    formatter.print();
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,44 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
+
+
+/**
+ * {@link Weight} that scores a term as tf * idf, taking both factors from a Lucene
+ * {@link Similarity}; a {@link DefaultSimilarity} is used when none is supplied.
+ **/
+public class TFIDF implements Weight {
+  private final Similarity sim;
+
+  public TFIDF() {
+    this(new DefaultSimilarity());
+  }
+
+  public TFIDF(Similarity sim) {
+    this.sim = sim;
+  }
+
+  @Override
+  public double calculate(int tf, int df, int length, int numDocs) {
+    //ignore length
+    return sim.tf(tf) * sim.idf(df, numDocs);
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,34 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Plain holder for one dictionary entry: the term text, its term index and its document
+ * frequency. Fields are public and mutable; the constructor stores its arguments unchanged.
+ **/
+public class TermEntry {
+  public String term;
+  public int termIdx;
+  public int docFreq;
+
+  public TermEntry(String term, int termIdx, int docFreq) {
+    this.term = term;
+    this.termIdx = termIdx;
+    this.docFreq = docFreq;
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,32 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+
+/**
+ * Lookup of dictionary data ({@link TermEntry}) for indexed terms, by field and term or as a whole.
+ * NOTE(review): per-method contracts are not documented here - confirm against the implementations.
+ **/
+public interface TermInfo {
+
+  int totalTerms(String field);
+
+  TermEntry getTermEntry(String field, String term);
+
+  Iterator<TermEntry> getAllEntries();
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,27 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.matrix.Vector;
+
+
+/**
+ * An {@link Iterable} whose elements are {@link Vector}s.  Declares no methods of its own.
+ *
+ **/
+public interface VectorIterable extends Iterable<Vector>{
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,36 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Calculates a weight for a term from its frequency statistics.
+ *
+ **/
+public interface Weight {
+
+  /**
+   * Calculates the weight for a single term.  Experimental.
+   *
+   * @param tf term freq
+   * @param df doc freq
+   * @param length Length of the document
+   * @param numDocs the total number of docs
+   * @return The weight
+   */
+  double calculate(int tf, int df, int length, int numDocs);
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,78 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.utils.vectors.TermEntry;
+
+import java.util.Map;
+import java.util.Iterator;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.io.IOException;
+
+
+/**
+ * Caches TermEntries from a single field.  Materializes all values in the TermEnum to memory (much like FieldCache)
+ *
+ **/
+public class CachedTermInfo implements TermInfo {
+
+  //Term text -> entry, in the order the terms were enumerated from the index
+  Map<String, TermEntry> termEntries;
+  //The only field this instance holds entries for
+  String field;
+
+  /**
+   * Reads the terms of <code>field</code> from the index and caches an entry for each term whose document
+   * frequency is within the given bounds.  Cached entries are numbered consecutively from 0, in enumeration order.
+   *
+   * @param reader the reader to enumerate the terms from
+   * @param field the field whose terms are cached
+   * @param minDf terms with a document frequency below this value are skipped
+   * @param maxDfPercent terms occurring in more than this percentage of the documents are skipped
+   * @throws IOException if the terms cannot be read from the index
+   */
+  public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
+    this.field = field;
+    int numDocs = reader.numDocs();
+    //Multiply as long: numDocs * maxDfPercent overflows an int on large indexes
+    double maxDf = (long) numDocs * maxDfPercent / 100.0;
+    //A linked hash map is used so that we know the terms stay in enumeration order
+    termEntries = new LinkedHashMap<String, TermEntry>();
+    TermEnum te = reader.terms(new Term(field, ""));
+    try {
+      int count = 0;
+      do {
+        Term term = te.term();
+        //The enum runs on into the following fields, so stop at the first term of another field
+        if (term == null || !term.field().equals(field)) {
+          break;
+        }
+        int df = te.docFreq();
+        if (df < minDf || df > maxDf) {
+          continue;
+        }
+        TermEntry entry = new TermEntry(term.text(), count++, df);
+        termEntries.put(entry.term, entry);
+      } while (te.next());
+    } finally {
+      //Release the enum even when reading the index fails part way through
+      te.close();
+    }
+  }
+
+  /**
+   * @param field ignored, this instance only holds the terms of a single field
+   * @return the number of cached entries
+   */
+  @Override
+  public int totalTerms(String field) {
+    return termEntries.size();
+  }
+
+  /**
+   * @param field the field the term belongs to
+   * @param term the text of the term
+   * @return the cached entry, or null if <code>field</code> is not the cached field or no entry is cached for
+   *         the term (e.g. because it was filtered out by its document frequency)
+   */
+  @Override
+  public TermEntry getTermEntry(String field, String term) {
+    if (!this.field.equals(field)) {
+      return null;
+    }
+    return termEntries.get(term);
+  }
+
+  /**
+   * @return an iterator over the cached entries, in the order they were added
+   */
+  @Override
+  public Iterator<TermEntry> getAllEntries() {
+    return termEntries.values().iterator();
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,128 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Collections;
+
+
+/**
+ *
+ *
+ **/
+public class LuceneIteratable implements VectorIterable {
+
+
+  private IndexReader indexReader;
+  private String field;
+  private String idField;
+  private FieldSelector idFieldSelector;
+
+  private VectorMapper mapper;
+  private double normPower = -1;
+
+  public LuceneIteratable(IndexReader reader, String idField, String field, VectorMapper mapper) {
+    this(reader, idField, field, mapper, 2.0);
+  }
+
+  /**
+   * Produce a LuceneIterable that can create the Vector plus normalize it.
+   * @param reader
+   * @param idField - The Field containing the id.  May be null
+   * @param field The field to use for the Vector
+   * @param mapper
+   * @param normPower
+   */
+  public LuceneIteratable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
+    this.indexReader = reader;
+    this.idField = idField;
+    this.field = field;
+    this.mapper = mapper;
+    this.normPower = normPower;
+    idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
+  }
+
+
+  @Override
+  public Iterator<Vector> iterator() {
+    try {
+      return new TDIterator();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private class TDIterator implements Iterator<Vector> {
+    private TermDocs termDocs;
+
+    private TDIterator() throws IOException {
+      //term docs(null) is a better way of iterating all the docs in Lucene
+      this.termDocs = indexReader.termDocs(null);
+    }
+
+    @Override
+    public boolean hasNext() {
+      try {
+        return termDocs.next();
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    @Override
+    public Vector next() {
+      Vector result = null;
+      int doc = termDocs.doc();
+      //
+      try {
+        indexReader.getTermFreqVector(doc, field, mapper);
+        result = mapper.getVector();
+        if (idField != null) {
+          String id = indexReader.document(doc, idFieldSelector).get(idField);
+          result.setName(id);
+        } else {
+          result.setName(String.valueOf(doc));
+        }
+        if (normPower >= 0){
+          result = result.normalize(normPower);
+        }
+      } catch (IOException e) {
+        //Log?
+        throw new RuntimeException(e);
+      }
+
+      return result;
+    }
+
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+
+  }
+
+
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,77 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.Weight;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+
+
+/**
+ * A {@link VectorMapper} that builds a {@link SparseVector} from a Lucene term vector.  Every term known to the
+ * {@link TermInfo} is stored at the index of its {@link TermEntry}, with the value calculated by the
+ * {@link Weight}.
+ * <p/>
+ * Not thread-safe
+ */
+public class TFDFMapper extends VectorMapper {
+
+  public static final int DEFAULT_CACHE_SIZE = 256;
+
+  protected IndexReader reader;
+  //The vector of the term vector being mapped; replaced on every call to setExpectations
+  protected Vector vector;
+
+  protected Weight weight;
+  //Number of terms in the term vector being mapped, handed to the Weight as the document length
+  protected int numTerms;
+  protected TermInfo termInfo;
+  private String field;
+  private int numDocs;
+
+  /**
+   * @param reader the reader of the index, used to get the total number of documents
+   * @param weight calculates the value stored in the vector for each term
+   * @param termInfo supplies the vector size as well as the index and document frequency of each term
+   */
+  public TFDFMapper(IndexReader reader, Weight weight, TermInfo termInfo) {
+    this.reader = reader;
+    this.weight = weight;
+    this.termInfo = termInfo;
+    this.numDocs = reader.numDocs();
+  }
+
+  /**
+   * @return the vector built for the most recently mapped term vector, null if nothing has been mapped yet
+   */
+  @Override
+  public Vector getVector() {
+    return vector;
+  }
+
+  /**
+   * Starts a new vector, sized by the total number of terms the TermInfo reports for the field.
+   */
+  @Override
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    this.field = field;
+    vector = new SparseVector(termInfo.totalTerms(field));
+    this.numTerms = numTerms;
+  }
+
+  /**
+   * Stores the weight of the term in the vector.  Terms the TermInfo has no entry for are skipped.
+   */
+  @Override
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    TermEntry entry = termInfo.getTermEntry(field, term);
+    //The entry is null when the TermInfo does not know the term, e.g. it was filtered out by its doc freq
+    if (entry != null) {
+      vector.setQuick(entry.termIdx, weight.calculate(frequency, entry.docFreq, numTerms, numDocs));
+    }
+  }
+
+  @Override
+  public boolean isIgnoringPositions() {
+    return true;
+  }
+
+  @Override
+  public boolean isIgnoringOffsets() {
+    return true;
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,35 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermVectorMapper;
+import org.apache.mahout.matrix.Vector;
+
+
+/**
+ * A Lucene {@link TermVectorMapper} that additionally exposes the {@link Vector} built from the term vector
+ * it was given.
+ * <p/>
+ * Not thread-safe
+ *
+ **/
+public abstract class VectorMapper extends TermVectorMapper {
+  /**
+   * Can be called after the TermVector has been mapped
+   * @return The {@link org.apache.mahout.matrix.Vector}
+   *
+   * @see #map(String, int, org.apache.lucene.index.TermVectorOffsetInfo[], int[])
+   */
+  public abstract Vector getVector();
+}

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,83 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.utils.vectors.Weight;
+import org.apache.mahout.utils.vectors.TFIDF;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+
+import java.util.Collections;
+
+
+/**
+ *
+ *
+ **/
+public class LuceneIterableTest extends TestCase {
+  protected RAMDirectory directory;
+
+  public static String [] DOCS = {
+        "The quick red fox jumped over the lazy brown dogs.",
+        "Mary had a little lamb whose fleece was white as snow.",
+        "Moby Dick is a story of a whale and a man obsessed.",
+        "The robber wore a black fleece jacket and a baseball cap.",
+        "The English Springer Spaniel is the best of all dogs."
+    };
+
+
+  @Override
+  protected void setUp() throws Exception {
+    directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.UNLIMITED);
+    for (int i = 0; i < DOCS.length; i++){
+      Document doc = new Document();
+      Field id = new Field("id", "doc_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+      doc.add(id);
+      //Store both position and offset information
+      Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
+      doc.add(text);
+      writer.addDocument(doc);
+    }
+    writer.close();
+  }
+
+  public void testIterable() throws Exception {
+    IndexReader reader = IndexReader.open(directory, true);
+    Weight weight = new TFIDF();
+    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+    LuceneIteratable iterable = new LuceneIteratable(reader, "id", "content", mapper);
+
+    //TODO: do something more meaningful here
+    for (Vector vector : iterable) {
+      assertTrue("vector is not an instanceof " + SparseVector.class, vector instanceof SparseVector);
+      assertTrue("vector Size: " + vector.cardinality() + " is not greater than: " + 0, vector.cardinality() > 0);
+    }
+  }
+
+
+}