You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/17 16:22:14 UTC
svn commit: r785618 - in /lucene/mahout/trunk: ./ examples/ maven/ utils/
utils/src/ utils/src/main/ utils/src/main/java/ utils/src/main/java/org/
utils/src/main/java/org/apache/ utils/src/main/java/org/apache/mahout/
utils/src/main/java/org/apache/mah...
Author: gsingers
Date: Wed Jun 17 14:22:08 2009
New Revision: 785618
URL: http://svn.apache.org/viewvc?rev=785618&view=rev
Log:
MAHOUT-126: Commit first iteration of this patch
Added:
lucene/mahout/trunk/utils/ (with props)
lucene/mahout/trunk/utils/pom.xml
lucene/mahout/trunk/utils/src/
lucene/mahout/trunk/utils/src/main/
lucene/mahout/trunk/utils/src/main/java/
lucene/mahout/trunk/utils/src/main/java/org/
lucene/mahout/trunk/utils/src/main/java/org/apache/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java
lucene/mahout/trunk/utils/src/main/resources/
lucene/mahout/trunk/utils/src/test/
lucene/mahout/trunk/utils/src/test/java/
lucene/mahout/trunk/utils/src/test/java/org/
lucene/mahout/trunk/utils/src/test/java/org/apache/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
lucene/mahout/trunk/utils/src/test/resources/
Modified:
lucene/mahout/trunk/examples/build-deprecated.xml
lucene/mahout/trunk/examples/pom.xml
lucene/mahout/trunk/maven/build.xml
lucene/mahout/trunk/pom.xml
Modified: lucene/mahout/trunk/examples/build-deprecated.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/build-deprecated.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/build-deprecated.xml (original)
+++ lucene/mahout/trunk/examples/build-deprecated.xml Wed Jun 17 14:22:08 2009
@@ -262,59 +262,4 @@
<!-- EXAMPLES -->
-
- <property name="working.dir" value="work"/>
- <target name="check-files">
- <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
- <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
- <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
- <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
-
- </target>
-
- <target name="enwiki-files" depends="check-files">
- <mkdir dir="temp"/>
- <antcall target="get-enwiki"/>
- <antcall target="expand-enwiki"/>
- </target>
-
- <target name="get-enwiki" unless="enwiki.exists">
- <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
- dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
- </target>
-
- <target name="expand-enwiki" unless="enwiki.expanded">
- <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
- </target>
-
-
- <target name="get-20news-18828" unless="20news-18828.exists">
- <get src="http://people.csail.mit.edu/jrennie/20Newsgroups/20news-18828.tar.gz"
- dest="temp/20news-18828.tar.gz"/>
-
- </target>
- <target name="expand-20news-18828" unless="20news-18828.expanded">
- <gunzip src="temp/20news-18828.tar.gz" dest="temp"/>
- <untar src="temp/20news-18828.tar" dest="${working.dir}"/>
- </target>
-
- <target name="extract-20news-18828" depends="check-files, compile" unless="reuters.extracted">
- <mkdir dir="${working.dir}/20news-18828-collapse"/>
- <java classname="org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups" maxmemory="1024M" fork="true">
- <classpath refid="test.classpath"/>
- <!--
- Input format is:
- inputDir outputDir label Analyzer character set
- -->
- <arg line="-p ${working.dir}/20news-18828/ -o ${working.dir}/20news-18828-collapse -a org.apache.lucene.analysis.standard.StandardAnalyzer -c UTF-8"/>
- </java>
- </target>
-
- <target name="get-files" depends="check-files" description="Get and extract the 20 Newsgroups data">
- <mkdir dir="temp"/>
- <antcall target="get-20news-18828"/>
- <antcall target="expand-20news-18828"/>
- <!--<antcall target="extract-20news-18828"/>-->
- </target>
-
</project>
Modified: lucene/mahout/trunk/examples/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/pom.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/pom.xml (original)
+++ lucene/mahout/trunk/examples/pom.xml Wed Jun 17 14:22:08 2009
@@ -102,7 +102,44 @@
<goal>run</goal>
</goals>
</execution>
+ <execution>
+ <id>get-20news</id>
+ <phase>process-classes</phase>
+ <configuration>
+ <tasks if="get.20news">
+ <ant antfile="../maven/build.xml" target="get-files">
+ <property name="dest" value="${project.build.directory}" />
+ <property name="fullnamever" value="${project.artifactId}-${project.version}" />
+ <property name="core-lib" value="../core/lib" />
+ <property name="shared-lib" value="../lib" />
+ <property name="version" value="${project.version}" />
+ </ant>
+ </tasks>
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>get-enwiki</id>
+ <phase>process-classes</phase>
+ <configuration>
+ <tasks if="get.enwiki">
+ <ant antfile="../maven/build.xml" target="enwiki-files">
+ <property name="dest" value="${project.build.directory}" />
+ <property name="fullnamever" value="${project.artifactId}-${project.version}" />
+ <property name="core-lib" value="../core/lib" />
+ <property name="shared-lib" value="../lib" />
+ <property name="version" value="${project.version}" />
+ </ant>
+ </tasks>
+
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
</executions>
</plugin>
Modified: lucene/mahout/trunk/maven/build.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/maven/build.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/maven/build.xml (original)
+++ lucene/mahout/trunk/maven/build.xml Wed Jun 17 14:22:08 2009
@@ -64,5 +64,66 @@
</fileset>
</jar>
</target>
+ <!-- Examples -->
+ <property name="working.dir" value="work"/>
+ <target name="check-files">
+ <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
+ <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>
+ <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
+ <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
+
+ </target>
+
+ <target name="enwiki-files" depends="check-files">
+ <mkdir dir="temp"/>
+ <antcall target="get-enwiki"/>
+ <antcall target="expand-enwiki"/>
+ </target>
+
+ <target name="get-enwiki" unless="enwiki.exists">
+ <get src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"
+ dest="temp/enwiki-20070527-pages-articles.xml.bz2"/>
+ </target>
+
+ <target name="expand-enwiki" unless="enwiki.expanded">
+ <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
+ </target>
+
+
+ <target name="get-20news-18828" unless="20news-18828.exists">
+ <get src="http://people.csail.mit.edu/jrennie/20Newsgroups/20news-18828.tar.gz"
+ dest="temp/20news-18828.tar.gz"/>
+
+ </target>
+ <target name="expand-20news-18828" unless="20news-18828.expanded">
+ <gunzip src="temp/20news-18828.tar.gz" dest="temp"/>
+ <untar src="temp/20news-18828.tar" dest="${working.dir}"/>
+ </target>
+
+ <target name="get-20news" depends="check-files">
+ <antcall target="get-20news-18828"/>
+ <antcall target="expand-20news-18828"/>
+ </target>
+
+ <target name="extract-20news-18828" depends="check-files" unless="reuters.extracted">
+ <mkdir dir="${working.dir}/20news-18828-collapse"/>
+ <java classname="org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups" maxmemory="1024M" fork="true">
+ <classpath refid="maven.test.classpath"/>
+ <!--
+ Input format is:
+ inputDir outputDir label Analyzer character set
+ -->
+ <arg line="-p ${working.dir}/20news-18828/ -o ${working.dir}/20news-18828-collapse -a org.apache.lucene.analysis.standard.StandardAnalyzer -c UTF-8"/>
+ </java>
+ </target>
+
+ <target name="get-files" depends="check-files" description="Get and extract the 20 Newsgroups and Wikipedia (enwiki) data">
+ <mkdir dir="temp"/>
+ <antcall target="get-20news"/>
+ <antcall target="enwiki-files"/>
+ <!-- Fetches and expands both data sets; the 20news extract step below is left disabled. -->
+ <!--<antcall target="extract-20news-18828"/>-->
+ </target>
+
</project>
Modified: lucene/mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/pom.xml?rev=785618&r1=785617&r2=785618&view=diff
==============================================================================
--- lucene/mahout/trunk/pom.xml (original)
+++ lucene/mahout/trunk/pom.xml Wed Jun 17 14:22:08 2009
@@ -23,6 +23,7 @@
<module>core</module>
<module>taste-web</module>
<module>examples</module>
+ <module>utils</module>
</modules>
<build>
Propchange: lucene/mahout/trunk/utils/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Jun 17 14:22:08 2009
@@ -0,0 +1,2 @@
+*.iml
+target
Added: lucene/mahout/trunk/utils/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/pom.xml?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/pom.xml (added)
+++ lucene/mahout/trunk/utils/pom.xml Wed Jun 17 14:22:08 2009
@@ -0,0 +1,155 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-parent</artifactId>
+ <version>1.0</version>
+ <relativePath>../maven</relativePath>
+ </parent>
+
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-utils</artifactId>
+ <version>0.2-SNAPSHOT</version>
+ <name>Mahout utilities</name>
+ <description>Utilities for preparing content into formats for Mahout.</description>
+
+ <packaging>jar</packaging>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>2.3</version>
+ <configuration>
+ <encoding>UTF-8</encoding>
+ </configuration>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>process-resources</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>
+ ${project.build.directory}/classes/META-INF
+ </outputDirectory>
+ <resources>
+ <resource>
+ <directory>..</directory>
+ <includes>
+ <include>README.txt</include>
+ <include>NOTICE.txt</include>
+ <include>LICENSE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <encoding>UTF-8</encoding>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>copy-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-dependencies</goal>
+ </goals>
+ <configuration>
+ <!-- configure the plugin here -->
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+
+ </build>
+
+ <dependencies>
+
+ <dependency>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-solrj</artifactId>
+ <version>1.4-SNAPSHOT</version>
+ </dependency>
+ <!-- core test -->
+ <!--<dependency>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>-->
+
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymock</artifactId>
+ <version>2.4</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymockclassextension</artifactId>
+ <version>2.4</version>
+ <scope>test</scope>
+ </dependency>
+
+ <!-- cglib contains nested dependencies that interfere with easymock,
+ thus the cglib references needs to be below easymock -->
+ <dependency>
+ <groupId>cglib</groupId>
+ <artifactId>cglib</artifactId>
+ <version>2.1_3</version>
+ </dependency>
+
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.2</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.5.6</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-jcl</artifactId>
+ <version>1.5.6</version>
+ </dependency>
+
+ </dependencies>
+
+
+ <!-- Source control location of the utils module on trunk. -->
+ <scm>
+ <connection>scm:svn:https://svn.apache.org/repos/asf/lucene/mahout/trunk/utils</connection>
+ <developerConnection>scm:svn:https://svn.apache.org/repos/asf/lucene/mahout/trunk/utils</developerConnection>
+ <url>https://svn.apache.org/repos/asf/lucene/mahout/trunk/utils</url>
+ </scm>
+</project>
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,207 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
+import org.apache.mahout.utils.vectors.lucene.LuceneIteratable;
+import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
+import org.apache.mahout.utils.vectors.lucene.VectorMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+
+
+/**
+ * Command-line tool that writes the vectors built from one field of a Lucene index to a text file,
+ * one formatted vector per line, and the terms of that field to a delimited dictionary file.
+ **/
+public class Driver {
+  private static final Logger log = LoggerFactory.getLogger(Driver.class);
+  //TODO: This assumes LuceneIterable, make it generic.
+
+  /**
+   * Parses the options and writes the vector and dictionary files; usage errors print the help.
+   *
+   * @param args command-line arguments, described by the options built below
+   * @throws IOException if the index cannot be read or an output file cannot be written
+   */
+  public static void main(String[] args) throws IOException {
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
+        abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).
+        withDescription("The Lucene directory").withShortName("d").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+        withDescription("The output file").withShortName("o").create();
+
+    Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
+        abuilder.withName("field").withMinimum(1).withMaximum(1).create()).
+        withDescription("The field in the index").withShortName("f").create();
+
+    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
+        abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
+        withDescription("The field in the index containing the index. If null, then the Lucene internal doc " +
+            "id is used which is prone to error if the underlying index changes").withShortName("i").create();
+
+    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
+        abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
+        withDescription("The output of the dictionary").withShortName("t").create();
+
+    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
+        abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
+        withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
+    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
+        abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).
+        withDescription("The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. " +
+            "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
+    Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
+        abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
+        withDescription("The maximum number of vectors to output. If not specified, then it will loop over all docs").withShortName("m").create();
+    Option helpOpt = obuilder.withLongName("help").
+        withDescription("Print out help").withShortName("h").create();
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
+        .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).create();
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      if (cmdLine.hasOption(helpOpt)) {
+        printHelp(group);
+        return;
+      }
+      //Springify all this
+      if (!cmdLine.hasOption(inputOpt)) {//only the Lucene case is supported
+        return;
+      }
+      File file = new File(cmdLine.getValue(inputOpt).toString());
+      if (!file.exists() || !file.isDirectory()) {
+        // Report the problem instead of silently exiting without producing any output.
+        log.error("Lucene directory does not exist or is not a directory: " + file);
+        return;
+      }
+      int maxDocs = Integer.MAX_VALUE;
+      if (cmdLine.hasOption(maxOpt)) {
+        maxDocs = Integer.parseInt(cmdLine.getValue(maxOpt).toString());
+      }
+      if (maxDocs < 0) {
+        throw new IllegalArgumentException("maxDocs must be >= 0");
+      }
+      double norm = -1; // sentinel: -1 selects the LuceneIteratable constructor without a norm
+      if (cmdLine.hasOption(powerOpt)) {
+        String power = cmdLine.getValue(powerOpt).toString();
+        norm = power.equals("INF") ? Double.POSITIVE_INFINITY : Double.parseDouble(power);
+      }
+      String field = cmdLine.getValue(fieldOpt).toString();
+      String idField = cmdLine.hasOption(idFieldOpt) ? cmdLine.getValue(idFieldOpt).toString() : null;
+      String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+      File outFile = new File(cmdLine.getValue(outputOpt).toString());
+      File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
+      Directory dir = FSDirectory.open(file);
+      IndexReader reader = IndexReader.open(dir, true);
+      try {
+        Weight weight = new TFIDF();
+        TermInfo termInfo = new CachedTermInfo(reader, field, 1, 99);
+        VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+        LuceneIteratable iteratable = norm == -1 ? new LuceneIteratable(reader, idField, field, mapper)
+            : new LuceneIteratable(reader, idField, field, mapper, norm);
+        writeVectors(iteratable, outFile, maxDocs);
+        writeDictionary(termInfo, dictOutFile, delimiter, file, field);
+      } finally {
+        reader.close(); // always release the index reader, even when writing fails
+      }
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      printHelp(group);
+    }
+  }
+
+  /** Writes at most maxDocs vectors to outFile, one asFormatString() per line, and always closes the file. */
+  private static void writeVectors(Iterable<? extends Vector> vectors, File outFile, int maxDocs) throws IOException {
+    log.info("Output File: " + outFile);
+    BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+    try {
+      int i = 0;
+      for (Vector vector : vectors) {
+        if (i >= maxDocs) {
+          break;
+        }
+        writer.write(vector.asFormatString());
+        writer.write("\n");
+        if (i % 500 == 0) {
+          log.info("i = " + i);
+        }
+        i++;
+      }
+      log.info("Wrote " + i + " vectors");
+    } finally {
+      writer.close(); // close() flushes, so no separate flush() is needed
+    }
+  }
+
+  /** Writes the header lines, then one "term, doc freq, idx" line per entry, as UTF-8; always closes the file. */
+  private static void writeDictionary(TermInfo termInfo, File dictOutFile, String delimiter, File input,
+                                      String field) throws IOException {
+    // TODO: replace with aa codec
+    log.info("Dictionary Output file: " + dictOutFile);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+    try {
+      Iterator<TermEntry> entIter = termInfo.getAllEntries();
+      writer.write("input" + delimiter + input.getAbsolutePath() + "\n");
+      writer.write("field" + delimiter + field + "\n");
+      writer.write("num.terms" + delimiter + termInfo.totalTerms(field) + "\n");
+      writer.write("#term" + delimiter + "doc freq" + delimiter + "idx" + "\n");
+      while (entIter.hasNext()) {
+        TermEntry entry = entIter.next();
+        writer.write(entry.term + delimiter + entry.docFreq + delimiter + entry.termIdx + "\n");
+      }
+    } finally {
+      writer.close();
+    }
+  }
+
+  /** Prints the usage text for the given option group. */
+  private static void printHelp(Group group) {
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.setGroup(group);
+    formatter.print();
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,44 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
+
+
+/**
+ * {@link Weight} implementation that multiplies the tf and idf factors of a Lucene
+ * {@link Similarity}; a {@link DefaultSimilarity} is used unless another one is supplied.
+ **/
+public class TFIDF implements Weight {
+  private final Similarity sim;
+
+  public TFIDF() {
+    this(new DefaultSimilarity());
+  }
+
+  public TFIDF(Similarity sim) {
+    this.sim = sim;
+  }
+
+  @Override
+  public double calculate(int tf, int df, int length, int numDocs) {
+    // the document length takes no part in this weighting
+    return sim.tf(tf) * sim.idf(df, numDocs);
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,34 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Plain value holder for one dictionary term: its text, its index and its document frequency.
+ * Driver writes these three values as the "#term", "idx" and "doc freq" dictionary columns.
+ **/
+public class TermEntry {
+ public String term; // the term text
+ public int termIdx; // index of the term (written as "idx" by Driver)
+ public int docFreq; // document frequency of the term (written as "doc freq" by Driver)
+
+ public TermEntry(String term, int termIdx, int docFreq) {
+ this.term = term;
+ this.termIdx = termIdx;
+ this.docFreq = docFreq;
+ }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,32 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+
+/**
+ * Query interface over the terms of an index: a per-field term count, per-term lookup and
+ * iteration over every {@link TermEntry}. CachedTermInfo is the Lucene-backed implementation.
+ **/
+public interface TermInfo {
+ /** Total number of terms for the given field; Driver writes this value as "num.terms". */
+ int totalTerms(String field);
+ /** Entry for a term of a field. NOTE(review): the result for an unknown term is not visible here. */
+ TermEntry getTermEntry(String field, String term);
+ /** Iterator over all entries; Driver walks it to write the dictionary file. */
+ Iterator<TermEntry> getAllEntries();
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorIterable.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,27 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.matrix.Vector;
+
+
+/**
+ * An {@link Iterable} over {@link Vector}s. Declares no methods of its own; it only gives
+ * sources of vectors a common, named type.
+ **/
+public interface VectorIterable extends Iterable<Vector>{
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,36 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Computes the weight of a term in a document from its frequency statistics.
+ * TFIDF is an implementation of this interface.
+ **/
+public interface Weight {
+
+ /**
+ * Experimental: calculates the weight of a term from the statistics below.
+ *
+ * @param tf term freq
+ * @param df doc freq
+ * @param length Length of the document (the TFIDF implementation ignores it)
+ * @param numDocs the total number of docs
+ * @return The weight
+ */
+ double calculate(int tf, int df, int length, int numDocs);
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,78 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.utils.vectors.TermEntry;
+
+import java.util.Map;
+import java.util.Iterator;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.io.IOException;
+
+
+/**
+ * Caches TermEntries from a single field. Materializes all values in the TermEnum to memory (much like FieldCache)
+ * <p/>
+ * Terms whose document frequency is lower than <code>minDf</code>, or higher than
+ * <code>maxDfPercent</code> percent of the reader's document count, are not cached;
+ * {@link #getTermEntry(String, String)} returns <code>null</code> for them.
+ **/
+public class CachedTermInfo implements TermInfo {
+
+  /** Cached entries keyed by term text, kept in the order the TermEnum produced them. */
+  Map<String, TermEntry> termEntries;
+  /** The one field this cache was built for. */
+  String field;
+
+  /**
+   * Enumerates every term of <code>field</code> and caches those that pass the document-frequency bounds.
+   * Cached terms are assigned consecutive indices starting at 0, in enumeration order.
+   *
+   * @param reader       the index to read terms from
+   * @param field        the field whose terms are cached
+   * @param minDf        terms with a document frequency below this value are skipped
+   * @param maxDfPercent terms occurring in more than this percentage of the documents are skipped
+   * @throws IOException if the term enumeration fails
+   */
+  public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
+    this.field = field;
+    int numDocs = reader.numDocs();
+    // Widen to double BEFORE multiplying: the int product numDocs * maxDfPercent
+    // overflows once the index holds more than roughly 21 million documents.
+    double maxDf = (double) numDocs * maxDfPercent / 100.0;
+    // A linked hash map is used so that we know the terms stay in enumeration order
+    termEntries = new LinkedHashMap<String, TermEntry>();
+    TermEnum te = reader.terms(new Term(field, ""));
+    try {
+      int count = 0;
+      do {
+        Term term = te.term();
+        // The enumeration runs on into the following fields; stop at the first foreign term
+        if (term == null || !term.field().equals(field)) {
+          break;
+        }
+        int df = te.docFreq();
+        if (df >= minDf && df <= maxDf) {
+          TermEntry entry = new TermEntry(term.text(), count++, df);
+          termEntries.put(entry.term, entry);
+        }
+      } while (te.next());
+    } finally {
+      // Release the enumeration even when reading the index fails part way through
+      te.close();
+    }
+  }
+
+  /**
+   * @param field ignored; the count always refers to the field this cache was built for
+   * @return the number of cached terms
+   */
+  @Override
+  public int totalTerms(String field) {
+    return termEntries.size();
+  }
+
+  /**
+   * @param field the field the term belongs to
+   * @param term  the term text
+   * @return the cached entry, or <code>null</code> if <code>field</code> is not the cached field
+   *         or the term was not cached
+   */
+  @Override
+  public TermEntry getTermEntry(String field, String term) {
+    if (!this.field.equals(field)) {
+      return null;
+    }
+    return termEntries.get(term);
+  }
+
+  /**
+   * @return an iterator over all cached entries, in the order they were enumerated
+   */
+  @Override
+  public Iterator<TermEntry> getAllEntries() {
+    return termEntries.values().iterator();
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIteratable.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,128 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Collections;
+
+
+/**
+ * A {@link VectorIterable} that produces one {@link Vector} per document of a Lucene index.
+ * Each vector is built by feeding the term vector of one field of the document to a
+ * {@link VectorMapper}; it is named after the document's id field (or the Lucene document
+ * number when no id field is given) and optionally normalized.
+ * <p/>
+ * The mapper is shared by all iterators created from this object.
+ **/
+public class LuceneIteratable implements VectorIterable {
+
+
+  private final IndexReader indexReader;
+  private final String field;
+  private final String idField;
+  private final FieldSelector idFieldSelector;
+
+  private final VectorMapper mapper;
+  /** Power of the norm used for normalization; a negative value disables normalization. */
+  private final double normPower;
+
+  /**
+   * Produce a LuceneIterable that creates the Vector and normalizes it with a power of 2.0.
+   *
+   * @param reader  the index to read documents and term vectors from
+   * @param idField The Field containing the id. May be null
+   * @param field   The field to use for the Vector
+   * @param mapper  receives the term vector of each document and builds the Vector
+   */
+  public LuceneIteratable(IndexReader reader, String idField, String field, VectorMapper mapper) {
+    this(reader, idField, field, mapper, 2.0);
+  }
+
+  /**
+   * Produce a LuceneIterable that can create the Vector plus normalize it.
+   *
+   * @param reader    the index to read documents and term vectors from
+   * @param idField   The Field containing the id. May be null
+   * @param field     The field to use for the Vector
+   * @param mapper    receives the term vector of each document and builds the Vector
+   * @param normPower the power passed to {@link Vector#normalize(double)}; negative to skip normalization
+   */
+  public LuceneIteratable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
+    this.indexReader = reader;
+    this.idField = idField;
+    this.field = field;
+    this.mapper = mapper;
+    this.normPower = normPower;
+    // Only the id field is loaded when a document is fetched to look up its name
+    idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
+  }
+
+
+  /**
+   * @return a new iterator positioned before the first document
+   * @throws RuntimeException wrapping the IOException if the index cannot be read
+   */
+  @Override
+  public Iterator<Vector> iterator() {
+    try {
+      return new TDIterator();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Walks the documents of the index through a {@link TermDocs} cursor. The cursor is only
+   * advanced once per returned element, so hasNext() may be called any number of times
+   * and next() may be called without a preceding hasNext().
+   */
+  private class TDIterator implements Iterator<Vector> {
+    private final TermDocs termDocs;
+    /** True when the cursor has already been moved to the document next() will return. */
+    private boolean advanced;
+    /** Result of the last cursor move; only meaningful while advanced is true. */
+    private boolean available;
+
+    private TDIterator() throws IOException {
+      //term docs(null) is a better way of iterating all the docs in Lucene
+      this.termDocs = indexReader.termDocs(null);
+    }
+
+    @Override
+    public boolean hasNext() {
+      if (!advanced) {
+        try {
+          available = termDocs.next();
+          if (!available) {
+            // Exhausted: release the cursor, it is never touched again
+            termDocs.close();
+          }
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+        advanced = true;
+      }
+      return available;
+    }
+
+    @Override
+    public Vector next() {
+      if (!hasNext()) {
+        throw new java.util.NoSuchElementException();
+      }
+      // Consume the look-ahead so the following hasNext() moves the cursor again
+      advanced = false;
+      int doc = termDocs.doc();
+      Vector result;
+      try {
+        // NOTE(review): presumably the mapper is not called when the document has no term
+        // vector for the field, in which case getVector() would return whatever the mapper
+        // held before; verify against the mapper implementation.
+        indexReader.getTermFreqVector(doc, field, mapper);
+        result = mapper.getVector();
+        if (idField != null) {
+          String id = indexReader.document(doc, idFieldSelector).get(idField);
+          result.setName(id);
+        } else {
+          result.setName(String.valueOf(doc));
+        }
+        if (normPower >= 0) {
+          result = result.normalize(normPower);
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+
+      return result;
+    }
+
+
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException();
+    }
+
+  }
+
+
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,77 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.Weight;
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+
+
+/**
+ * A {@link VectorMapper} that fills a {@link SparseVector} with one weighted value per term.
+ * The position of a term in the vector and its document frequency come from a {@link TermInfo};
+ * the value is computed by a {@link Weight}.
+ * <p/>
+ * Not thread-safe
+ */
+public class TFDFMapper extends VectorMapper {
+
+  public static final int DEFAULT_CACHE_SIZE = 256;
+
+  protected IndexReader reader;
+  /** The vector of the term vector currently (or most recently) being mapped. */
+  protected Vector vector;
+
+  protected Weight weight;
+  /** The number of terms announced by the last call to setExpectations. */
+  protected int numTerms;
+  protected TermInfo termInfo;
+  private String field;
+  private int numDocs;
+
+  /**
+   * @param reader   the index; its document count is read once, here
+   * @param weight   computes the value stored for each term
+   * @param termInfo supplies the vector index and document frequency of each term
+   */
+  public TFDFMapper(IndexReader reader, Weight weight, TermInfo termInfo) {
+    this.reader = reader;
+    this.weight = weight;
+    this.termInfo = termInfo;
+    this.numDocs = reader.numDocs();
+  }
+
+  /**
+   * @return the vector built from the most recently mapped term vector, or <code>null</code>
+   *         if setExpectations has never been called
+   */
+  @Override
+  public Vector getVector() {
+    return vector;
+  }
+
+  /**
+   * Starts a new vector, sized to the total number of terms the TermInfo reports for the field.
+   */
+  @Override
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+    this.field = field;
+    vector = new SparseVector(termInfo.totalTerms(field));
+    this.numTerms = numTerms;
+  }
+
+  /**
+   * Stores the weight of one term in the current vector. Terms the TermInfo does not know
+   * are skipped.
+   */
+  @Override
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+    TermEntry entry = termInfo.getTermEntry(field, term);
+    // The TermInfo may have dropped this term (e.g. CachedTermInfo filters by document
+    // frequency), so there is no vector slot for it
+    if (entry == null) {
+      return;
+    }
+    vector.setQuick(entry.termIdx, weight.calculate(frequency, entry.docFreq, numTerms, numDocs));
+  }
+
+  @Override
+  public boolean isIgnoringPositions() {
+    return true;
+  }
+
+  @Override
+  public boolean isIgnoringOffsets() {
+    return true;
+  }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/VectorMapper.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,35 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermVectorMapper;
+import org.apache.mahout.matrix.Vector;
+
+
+/**
+ * A Lucene {@link TermVectorMapper} that additionally exposes what it mapped
+ * as a Mahout {@link org.apache.mahout.matrix.Vector}.
+ *
+ * Not thread-safe
+ *
+ **/
+public abstract class VectorMapper extends TermVectorMapper {
+ /**
+ * Can be called after the TermVector has been mapped
+ * @return The {@link org.apache.mahout.matrix.Vector}
+ *
+ * @see #map(String, int, org.apache.lucene.index.TermVectorOffsetInfo[], int[])
+ */
+ public abstract Vector getVector();
+}
Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=785618&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java Wed Jun 17 14:22:08 2009
@@ -0,0 +1,83 @@
+package org.apache.mahout.utils.vectors.lucene;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.utils.vectors.Weight;
+import org.apache.mahout.utils.vectors.TFIDF;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+
+import java.util.Collections;
+
+
+/**
+ *
+ *
+ **/
+public class LuceneIterableTest extends TestCase {
+ protected RAMDirectory directory;
+
+ public static String [] DOCS = {
+ "The quick red fox jumped over the lazy brown dogs.",
+ "Mary had a little lamb whose fleece was white as snow.",
+ "Moby Dick is a story of a whale and a man obsessed.",
+ "The robber wore a black fleece jacket and a baseball cap.",
+ "The English Springer Spaniel is the best of all dogs."
+ };
+
+
+ @Override
+ protected void setUp() throws Exception {
+ directory = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.UNLIMITED);
+ for (int i = 0; i < DOCS.length; i++){
+ Document doc = new Document();
+ Field id = new Field("id", "doc_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
+ doc.add(id);
+ //Store both position and offset information
+ Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
+ doc.add(text);
+ writer.addDocument(doc);
+ }
+ writer.close();
+ }
+
+ public void testIterable() throws Exception {
+ IndexReader reader = IndexReader.open(directory, true);
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+ VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+ LuceneIteratable iterable = new LuceneIteratable(reader, "id", "content", mapper);
+
+ //TODO: do something more meaningful here
+ for (Vector vector : iterable) {
+ assertTrue("vector is not an instanceof " + SparseVector.class, vector instanceof SparseVector);
+ assertTrue("vector Size: " + vector.cardinality() + " is not greater than: " + 0, vector.cardinality() > 0);
+ }
+ }
+
+
+}