You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@mahout.apache.org by Dmitriy Lyubimov <dl...@gmail.com> on 2011/03/26 02:12:10 UTC
Re: svn commit: r1085397 - in /mahout/trunk/utils: ./
src/main/java/org/apache/mahout/utils/vectors/ src/main/java/org/apache/mahout/utils/vectors/csv/
src/main/java/org/apache/mahout/utils/vectors/io/ src/test/java/org/apache/mahout/utils/vectors/cs
That would be a typical change I am trying to fix with 622:
http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397
On Fri, Mar 25, 2011 at 7:28 AM, <gs...@apache.org> wrote:
> Author: gsingers
> Date: Fri Mar 25 14:28:12 2011
> New Revision: 1085397
>
> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
> Log:
> MAHOUT-548: add in some CSV support for creating vectors, as well as a few other fixes for working with vectors
>
> Added:
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
> Modified:
> mahout/trunk/utils/pom.xml
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>
> Modified: mahout/trunk/utils/pom.xml
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/pom.xml (original)
> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
> @@ -142,6 +142,11 @@
> <type>test-jar</type>
> <scope>test</scope>
> </dependency>
> + <dependency>
> + <groupId>org.apache.solr</groupId>
> + <artifactId>solr-commons-csv</artifactId>
> + <version>1.4.1</version>
> + </dependency>
>
> <dependency>
> <groupId>junit</groupId>
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Fri Mar 25 14:28:12 2011
> @@ -77,16 +77,22 @@ public final class VectorDumper {
> Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
> abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
> "The dictionary file type (text|sequencefile)").withShortName("dt").create();
> - Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
> - "Output the centroid as JSON. Otherwise it substitutes in the terms for vector cell entries")
> + Option jsonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
> + "Output the Vector as JSON. Otherwise it substitutes in the terms for vector cell entries")
> .withShortName("j").create();
> + Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
> + "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries")
> + .withShortName("c").create();
> + Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
> + "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
> + .withShortName("n").create();
> Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
> withDescription("Dump only the size of the vector").withShortName("sz").create();
> Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
> .create();
>
> Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
> - dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
> + dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
> printKeyOpt).withOption(sizeOpt).create();
>
> try {
> @@ -122,10 +128,12 @@ public final class VectorDumper {
> throw new OptionException(dictTypeOpt);
> }
> }
> - boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
> + boolean useJSON = cmdLine.hasOption(jsonOpt);
> + boolean useCSV = cmdLine.hasOption(csvOpt);
> +
> boolean sizeOnly = cmdLine.hasOption(sizeOpt);
> SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
> -
> + boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
> Writable keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
> Writable valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
> boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
> @@ -140,6 +148,16 @@ public final class VectorDumper {
> try {
> boolean printKey = cmdLine.hasOption(printKeyOpt);
> long i = 0;
> + if (useCSV && dictionary != null){
> + writer.write("#");
> + for (int j = 0; j < dictionary.length; j++) {
> + writer.write(dictionary[j]);
> + if (j < dictionary.length - 1){
> + writer.write(',');
> + }
> + }
> + writer.write('\n');
> + }
> while (reader.next(keyWritable, valueWritable)) {
> if (printKey) {
> Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
> @@ -159,7 +177,14 @@ public final class VectorDumper {
> writer.write(String.valueOf(vector.size()));
> writer.write('\n');
> } else {
> - String fmtStr = useJSON ? vector.asFormatString() : VectorHelper.vectorToString(vector, dictionary);
> + String fmtStr;
> + if (useJSON){
> + fmtStr = VectorHelper.vectorToJSONString(vector, dictionary);
> + } else if (useCSV){
> + fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
> + } else {
> + fmtStr = vector.asFormatString();
> + }
> writer.write(fmtStr);
> writer.write('\n');
> }
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Fri Mar 25 14:28:12 2011
> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
> public final class VectorHelper {
>
> private static final Pattern TAB_PATTERN = Pattern.compile("\t");
> +
>
> private VectorHelper() { }
> -
> +
> + public static String vectorToCSVString(Vector vector, boolean namesAsComments){
> + StringBuilder bldr = new StringBuilder(2048);
> + try {
> + vectorToCSVString(vector, namesAsComments, bldr);
> + } catch (IOException e) {
> + throw new RuntimeException(e);
> + }
> + return bldr.toString();
> + }
> +
> + public static void vectorToCSVString(Vector vector, boolean namesAsComments,
> + Appendable bldr) throws IOException {
> + if (namesAsComments && vector instanceof NamedVector){
> + bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
> + }
> + Iterator<Vector.Element> iter = vector.iterator();
> + boolean first = true;
> + while (iter.hasNext()) {
> + if (first) {
> + first = false;
> + } else {
> + bldr.append(",");
> + }
> + Vector.Element elt = iter.next();
> + bldr.append(String.valueOf(elt.get()));
> + }
> + bldr.append('\n');
> + }
> +
> +
> /**
> * @return a String from a vector that fills in the values with the appropriate value from a dictionary where
> * each the ith entry is the term for the ith vector cell.
> */
> - public static String vectorToString(Vector vector, String[] dictionary) {
> + public static String vectorToJSONString(Vector vector, String[] dictionary) {
> StringBuilder bldr = new StringBuilder(2048);
>
> if (vector instanceof NamedVector) {
> @@ -67,12 +98,13 @@ public final class VectorHelper {
> if (dictionary != null) {
> bldr.append(dictionary[elt.index()]);
> } else {
> - bldr.append(elt.index());
> + bldr.append(String.valueOf(elt.index()));
> }
> - bldr.append(':').append(elt.get());
> + bldr.append(':').append(String.valueOf(elt.get()));
> }
> return bldr.append('}').toString();
> }
> +
>
> /**
> * Read in a dictionary file. Format is:
>
> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java (added)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,94 @@
> +package org.apache.mahout.utils.vectors.csv;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.commons.csv.CSVParser;
> +import org.apache.commons.csv.CSVStrategy;
> +import org.apache.mahout.math.DenseVector;
> +import org.apache.mahout.math.Vector;
> +
> +import java.io.BufferedReader;
> +import java.io.IOException;
> +import java.io.Reader;
> +import java.util.Iterator;
> +
> +
> +/**
> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
> + * <br/>
> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
> + * <p/>
> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
> + * <p/>
> + * The Iterator is not thread-safe.
> + *
> + *
> + **/
> +public class CSVVectorIterable implements Iterable<Vector> {
> + protected CSVParser parser;
> + protected String [] line;
> +
> + public CSVVectorIterable(Reader reader) throws IOException {
> + parser = new CSVParser(reader);
> + line = parser.getLine();
> + }
> +
> + public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws IOException {
> + parser = new CSVParser(reader, strategy);
> + line = parser.getLine();
> + }
> +
> +
> + @Override
> + public Iterator<Vector> iterator() {
> + return new CSVIterator();
> + }
> +
> + private class CSVIterator implements Iterator<Vector>{
> +
> +
> + public CSVIterator() {
> + }
> +
> + @Override
> + public boolean hasNext() {
> + return line != null;
> + }
> +
> + @Override
> + public Vector next() {
> +
> + Vector result = null;
> + result = new DenseVector(line.length);
> + for (int i = 0; i < line.length; i++) {
> + result.setQuick(i, Double.parseDouble(line[i]));
> + }
> + //move the line forward
> + try {
> + line = parser.getLine();
> + } catch (IOException e) {
> + throw new RuntimeException(e);
> + }
> + return result;
> + }
> +
> + @Override
> + public void remove() {
> + throw new UnsupportedOperationException();
> + }
> + }
> +}
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Fri Mar 25 14:28:12 2011
> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
> /**
> * Write out the vectors to any {@link java.io.Writer} using {@link org.apache.mahout.math.Vector#asFormatString()}.
> */
> -public class JWriterVectorWriter implements VectorWriter {
> - private final Writer writer;
> +public class JWriterVectorWriter extends VectorWriter {
> + protected final Writer writer;
>
> public JWriterVectorWriter(Writer writer) {
> this.writer = writer;
> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
> if (result >= maxDocs) {
> break;
> }
> - writer.write(vector.asFormatString());
> - writer.write('\n');
> -
> + formatVector(vector);
> result++;
> }
> return result;
> }
> -
> +
> + protected void formatVector(Vector vector) throws IOException {
> + writer.write(vector.asFormatString());
> + writer.write('\n');
> + }
> +
> + @Override
> + public void write(Vector vector) throws IOException {
> + formatVector(vector);
> + }
> +
> @Override
> public void close() throws IOException {
> writer.flush();
>
> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java (added)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,105 @@
> +package org.apache.mahout.utils.vectors.io;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.fs.ContentSummary;
> +import org.apache.hadoop.fs.FileSystem;
> +import org.apache.hadoop.fs.Path;
> +import org.apache.hadoop.io.SequenceFile;
> +import org.apache.hadoop.io.Writable;
> +import org.apache.mahout.math.Vector;
> +import org.apache.mahout.math.VectorWritable;
> +
> +import java.io.IOException;
> +import java.util.Iterator;
> +
> +
> +/**
> + * Given a Sequence File containing vectors (actually, {@link org.apache.mahout.math.VectorWritable}, iterate over it.
> + *
> + **/
> +public class SequenceFileVectorIterable implements Iterable<Vector>{
> + protected SequenceFile.Reader reader;
> + protected long fileLen;
> + protected Writable keyWritable;
> + protected Writable valueWritable;
> + protected boolean useKey;
> +
> + /**
> + * Construct the Iterable
> + * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the {@link org.apache.hadoop.io.SequenceFile}
> + * @param file The {@link org.apache.hadoop.fs.Path} containing the file
> + * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
> + * @param useKey If true, use the key as the {@link org.apache.mahout.math.VectorWritable}, otherwise use the value
> + * @throws IllegalAccessException
> + * @throws InstantiationException
> + * @throws IOException
> + */
> + public SequenceFileVectorIterable(FileSystem fs, Path file, Configuration conf, boolean useKey) throws IllegalAccessException, InstantiationException, IOException {
> + this.reader = new SequenceFile.Reader(fs, file, conf);
> + ContentSummary summary = fs.getContentSummary(file);
> + fileLen = summary.getLength();
> + this.useKey = useKey;
> + keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
> + valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
> + }
> +
> + /**
> + * The Iterator returned does not support remove()
> + * @return The {@link java.util.Iterator}
> + */
> + public Iterator<Vector> iterator() {
> + return new SFIterator();
> +
> + }
> +
> + private final class SFIterator implements Iterator<Vector>{
> + @Override
> + public boolean hasNext() {
> + //TODO: is this legitimate? We can't call next here since it breaks the iterator contract
> + try {
> + return reader.getPosition() < fileLen;
> + } catch (IOException e) {
> + return false;
> + }
> + }
> +
> + @Override
> + public Vector next() {
> + Vector result = null;
> + boolean valid = false;
> + try {
> + valid = reader.next(keyWritable, valueWritable);
> + if (valid){
> + result = ((VectorWritable) (useKey ? keyWritable : valueWritable)).get();
> + }
> + } catch (IOException e) {
> + throw new RuntimeException(e);
> + }
> +
> + return result;
> + }
> +
> + /**
> + * Not supported
> + */
> + public void remove() {
> + throw new UnsupportedOperationException();
> + }
> + }
> +}
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Fri Mar 25 14:28:12 2011
> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
> *
> * Closes the writer when done
> */
> -public class SequenceFileVectorWriter implements VectorWriter {
> +public class SequenceFileVectorWriter extends VectorWriter {
> private final SequenceFile.Writer writer;
> -
> + long recNum = 0;
> public SequenceFileVectorWriter(SequenceFile.Writer writer) {
> this.writer = writer;
> }
>
> @Override
> public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
> - long recNum = 0;
> +
> for (Vector point : iterable) {
> if (recNum >= maxDocs) {
> break;
> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
> }
> return recNum;
> }
> -
> +
> + @Override
> + public void write(Vector vector) throws IOException {
> + writer.append(new LongWritable(recNum++), new VectorWritable(vector));
> +
> + }
> +
> @Override
> public long write(Iterable<Vector> iterable) throws IOException {
> return write(iterable, Long.MAX_VALUE);
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Fri Mar 25 14:28:12 2011
> @@ -21,7 +21,7 @@ import java.io.IOException;
>
> import org.apache.mahout.math.Vector;
>
> -public interface VectorWriter {
> +public abstract class VectorWriter {
> /**
> * Write all values in the Iterable to the output
> * @param iterable The {@link Iterable} to loop over
> @@ -29,7 +29,15 @@ public interface VectorWriter {
> * @throws IOException if there was a problem writing
> *
> */
> - long write(Iterable<Vector> iterable) throws IOException;
> + public abstract long write(Iterable<Vector> iterable) throws IOException;
> +
> + /**
> + * Write out a vector
> + *
> + * @param vector The {@link org.apache.mahout.math.Vector} to write
> + * @throws IOException
> + */
> + public abstract void write(Vector vector) throws IOException;
>
> /**
> * Write the first <code>maxDocs</code> to the output.
> @@ -38,12 +46,12 @@ public interface VectorWriter {
> * @return The number of docs written
> * @throws IOException if there was a problem writing
> */
> - long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
> + public abstract long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>
> /**
> * Close any internally held resources. If external Writers are passed in, the implementation should indicate
> * whether it also closes them
> * @throws IOException if there was an issue closing the item
> */
> - void close() throws IOException;
> + public abstract void close() throws IOException;
> }
>
> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (added)
> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,60 @@
> +package org.apache.mahout.utils.vectors.csv;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.mahout.math.Vector;
> +import org.apache.mahout.utils.MahoutTestCase;
> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
> +import org.apache.mahout.utils.vectors.VectorHelper;
> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
> +import org.junit.Test;
> +
> +import java.io.IOException;
> +import java.io.StringReader;
> +import java.io.StringWriter;
> +
> +
> +/**
> + *
> + *
> + **/
> +public class CSVVectorIterableTest extends MahoutTestCase {
> +
> +
> + @Test
> + public void test() throws Exception {
> +
> + StringWriter sWriter = new StringWriter();
> + JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
> +
> + protected void formatVector(Vector vector) throws IOException {
> + String vecStr = VectorHelper.vectorToCSVString(vector, false);
> + writer.write(vecStr);
> + }
> + };
> + Iterable<Vector> iter = new RandomVectorIterable(50);
> + jwvw.write(iter);
> + jwvw.close();
> + CSVVectorIterable csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString()));
> + int count = 0;
> + for (Vector vector : csvIter) {
> + //System.out.println("Vec: " + vector);
> + count++;
> + }
> + assertEquals(50, count);
> + }
> +}
>
> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java (added)
> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,39 @@
> +package org.apache.mahout.utils.vectors.io;
> +
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.fs.FileSystem;
> +import org.apache.hadoop.fs.Path;
> +import org.apache.hadoop.io.LongWritable;
> +import org.apache.hadoop.io.SequenceFile;
> +import org.apache.mahout.math.Vector;
> +import org.apache.mahout.math.VectorWritable;
> +import org.apache.mahout.utils.MahoutTestCase;
> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
> +import org.junit.Test;
> +
> +
> +/**
> + *
> + *
> + **/
> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
> +
> +
> + @Test
> + public void testSFVI() throws Exception {
> + Path path = getTestTempFilePath("sfvw");
> + Configuration conf = new Configuration();
> + FileSystem fs = FileSystem.get(conf);
> + SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
> + SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
> + Iterable<Vector> iter = new RandomVectorIterable(50);
> + writer.write(iter);
> + writer.close();
> + SequenceFileVectorIterable sfVIter = new SequenceFileVectorIterable(fs, path, conf, false);
> + int count = 0;
> + for (Vector vector : sfVIter) {
> + count++;
> + }
> + assertEquals(50, count);
> + }
> +}
>
>
>
Re: svn commit: r1085397 - in /mahout/trunk/utils: ./
src/main/java/org/apache/mahout/utils/vectors/ src/main/java/org/apache/mahout/utils/vectors/csv/
src/main/java/org/apache/mahout/utils/vectors/io/ src/test/java/org/apache/mahout/utils/vectors/cs
Posted by Dmitriy Lyubimov <dl...@gmail.com>.
No prob. Was too late to comment :) I will tackle it in one of the 622 patches :)
On Sat, Mar 26, 2011 at 4:03 AM, Grant Ingersoll <gs...@apache.org> wrote:
> Ah, OK. Good to know. Hadn't followed that one. Feel free to change as appropriate or I can.
>
> On Mar 25, 2011, at 9:12 PM, Dmitriy Lyubimov wrote:
>
>> That would be a typical change I am trying to fix with 622:
>> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397
>>
>>
>>
>> On Fri, Mar 25, 2011 at 7:28 AM, <gs...@apache.org> wrote:
>>> Author: gsingers
>>> Date: Fri Mar 25 14:28:12 2011
>>> New Revision: 1085397
>>>
>>> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
>>> Log:
>>> MAHOUT-548: add in some CSV support for creating vectors, as well as a few other fixes for working with vectors
>>>
>>> Added:
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>> Modified:
>>> mahout/trunk/utils/pom.xml
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>>
>>> Modified: mahout/trunk/utils/pom.xml
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/pom.xml (original)
>>> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
>>> @@ -142,6 +142,11 @@
>>> <type>test-jar</type>
>>> <scope>test</scope>
>>> </dependency>
>>> + <dependency>
>>> + <groupId>org.apache.solr</groupId>
>>> + <artifactId>solr-commons-csv</artifactId>
>>> + <version>1.4.1</version>
>>> + </dependency>
>>>
>>> <dependency>
>>> <groupId>junit</groupId>
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Fri Mar 25 14:28:12 2011
>>> @@ -77,16 +77,22 @@ public final class VectorDumper {
>>> Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
>>> abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
>>> "The dictionary file type (text|sequencefile)").withShortName("dt").create();
>>> - Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>>> - "Output the centroid as JSON. Otherwise it substitutes in the terms for vector cell entries")
>>> + Option jsonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>>> + "Output the Vector as JSON. Otherwise it substitutes in the terms for vector cell entries")
>>> .withShortName("j").create();
>>> + Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
>>> + "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries")
>>> + .withShortName("c").create();
>>> + Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
>>> + "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
>>> + .withShortName("n").create();
>>> Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
>>> withDescription("Dump only the size of the vector").withShortName("sz").create();
>>> Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
>>> .create();
>>>
>>> Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
>>> - dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
>>> + dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
>>> printKeyOpt).withOption(sizeOpt).create();
>>>
>>> try {
>>> @@ -122,10 +128,12 @@ public final class VectorDumper {
>>> throw new OptionException(dictTypeOpt);
>>> }
>>> }
>>> - boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
>>> + boolean useJSON = cmdLine.hasOption(jsonOpt);
>>> + boolean useCSV = cmdLine.hasOption(csvOpt);
>>> +
>>> boolean sizeOnly = cmdLine.hasOption(sizeOpt);
>>> SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
>>> -
>>> + boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
>>> Writable keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>> Writable valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>>> boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
>>> @@ -140,6 +148,16 @@ public final class VectorDumper {
>>> try {
>>> boolean printKey = cmdLine.hasOption(printKeyOpt);
>>> long i = 0;
>>> + if (useCSV && dictionary != null){
>>> + writer.write("#");
>>> + for (int j = 0; j < dictionary.length; j++) {
>>> + writer.write(dictionary[j]);
>>> + if (j < dictionary.length - 1){
>>> + writer.write(',');
>>> + }
>>> + }
>>> + writer.write('\n');
>>> + }
>>> while (reader.next(keyWritable, valueWritable)) {
>>> if (printKey) {
>>> Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
>>> @@ -159,7 +177,14 @@ public final class VectorDumper {
>>> writer.write(String.valueOf(vector.size()));
>>> writer.write('\n');
>>> } else {
>>> - String fmtStr = useJSON ? vector.asFormatString() : VectorHelper.vectorToString(vector, dictionary);
>>> + String fmtStr;
>>> + if (useJSON){
>>> + fmtStr = VectorHelper.vectorToJSONString(vector, dictionary);
>>> + } else if (useCSV){
>>> + fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
>>> + } else {
>>> + fmtStr = vector.asFormatString();
>>> + }
>>> writer.write(fmtStr);
>>> writer.write('\n');
>>> }
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Fri Mar 25 14:28:12 2011
>>> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
>>> public final class VectorHelper {
>>>
>>> private static final Pattern TAB_PATTERN = Pattern.compile("\t");
>>> +
>>>
>>> private VectorHelper() { }
>>> -
>>> +
>>> + public static String vectorToCSVString(Vector vector, boolean namesAsComments){
>>> + StringBuilder bldr = new StringBuilder(2048);
>>> + try {
>>> + vectorToCSVString(vector, namesAsComments, bldr);
>>> + } catch (IOException e) {
>>> + throw new RuntimeException(e);
>>> + }
>>> + return bldr.toString();
>>> + }
>>> +
>>> + public static void vectorToCSVString(Vector vector, boolean namesAsComments,
>>> + Appendable bldr) throws IOException {
>>> + if (namesAsComments && vector instanceof NamedVector){
>>> + bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
>>> + }
>>> + Iterator<Vector.Element> iter = vector.iterator();
>>> + boolean first = true;
>>> + while (iter.hasNext()) {
>>> + if (first) {
>>> + first = false;
>>> + } else {
>>> + bldr.append(",");
>>> + }
>>> + Vector.Element elt = iter.next();
>>> + bldr.append(String.valueOf(elt.get()));
>>> + }
>>> + bldr.append('\n');
>>> + }
>>> +
>>> +
>>> /**
>>> * @return a String from a vector that fills in the values with the appropriate value from a dictionary where
>>> * each the ith entry is the term for the ith vector cell.
>>> */
>>> - public static String vectorToString(Vector vector, String[] dictionary) {
>>> + public static String vectorToJSONString(Vector vector, String[] dictionary) {
>>> StringBuilder bldr = new StringBuilder(2048);
>>>
>>> if (vector instanceof NamedVector) {
>>> @@ -67,12 +98,13 @@ public final class VectorHelper {
>>> if (dictionary != null) {
>>> bldr.append(dictionary[elt.index()]);
>>> } else {
>>> - bldr.append(elt.index());
>>> + bldr.append(String.valueOf(elt.index()));
>>> }
>>> - bldr.append(':').append(elt.get());
>>> + bldr.append(':').append(String.valueOf(elt.get()));
>>> }
>>> return bldr.append('}').toString();
>>> }
>>> +
>>>
>>> /**
>>> * Read in a dictionary file. Format is:
>>>
>>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java (added)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,94 @@
>>> +package org.apache.mahout.utils.vectors.csv;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements. See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License. You may obtain a copy of the License at
>>> + *
>>> + * http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.commons.csv.CSVParser;
>>> +import org.apache.commons.csv.CSVStrategy;
>>> +import org.apache.mahout.math.DenseVector;
>>> +import org.apache.mahout.math.Vector;
>>> +
>>> +import java.io.BufferedReader;
>>> +import java.io.IOException;
>>> +import java.io.Reader;
>>> +import java.util.Iterator;
>>> +
>>> +
>>> +/**
>>> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
>>> + * <br/>
>>> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
>>> + * <p/>
>>> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
>>> + * <p/>
>>> + * The Iterator is not thread-safe.
>>> + *
>>> + *
>>> + **/
>>> +public class CSVVectorIterable implements Iterable<Vector> {
>>> + protected CSVParser parser;
>>> + protected String [] line;
>>> +
>>> + public CSVVectorIterable(Reader reader) throws IOException {
>>> + parser = new CSVParser(reader);
>>> + line = parser.getLine();
>>> + }
>>> +
>>> + public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws IOException {
>>> + parser = new CSVParser(reader, strategy);
>>> + line = parser.getLine();
>>> + }
>>> +
>>> +
>>> + @Override
>>> + public Iterator<Vector> iterator() {
>>> + return new CSVIterator();
>>> + }
>>> +
>>> + private class CSVIterator implements Iterator<Vector>{
>>> +
>>> +
>>> + public CSVIterator() {
>>> + }
>>> +
>>> + @Override
>>> + public boolean hasNext() {
>>> + return line != null;
>>> + }
>>> +
>>> + @Override
>>> + public Vector next() {
>>> +
>>> + Vector result = null;
>>> + result = new DenseVector(line.length);
>>> + for (int i = 0; i < line.length; i++) {
>>> + result.setQuick(i, Double.parseDouble(line[i]));
>>> + }
>>> + //move the line forward
>>> + try {
>>> + line = parser.getLine();
>>> + } catch (IOException e) {
>>> + throw new RuntimeException(e);
>>> + }
>>> + return result;
>>> + }
>>> +
>>> + @Override
>>> + public void remove() {
>>> + throw new UnsupportedOperationException();
>>> + }
>>> + }
>>> +}
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Fri Mar 25 14:28:12 2011
>>> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
>>> /**
>>> * Write out the vectors to any {@link java.io.Writer} using {@link org.apache.mahout.math.Vector#asFormatString()}.
>>> */
>>> -public class JWriterVectorWriter implements VectorWriter {
>>> - private final Writer writer;
>>> +public class JWriterVectorWriter extends VectorWriter {
>>> + protected final Writer writer;
>>>
>>> public JWriterVectorWriter(Writer writer) {
>>> this.writer = writer;
>>> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
>>> if (result >= maxDocs) {
>>> break;
>>> }
>>> - writer.write(vector.asFormatString());
>>> - writer.write('\n');
>>> -
>>> + formatVector(vector);
>>> result++;
>>> }
>>> return result;
>>> }
>>> -
>>> +
>>> + protected void formatVector(Vector vector) throws IOException {
>>> + writer.write(vector.asFormatString());
>>> + writer.write('\n');
>>> + }
>>> +
>>> + @Override
>>> + public void write(Vector vector) throws IOException {
>>> + formatVector(vector);
>>> + }
>>> +
>>> @Override
>>> public void close() throws IOException {
>>> writer.flush();
>>>
>>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java (added)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,105 @@
>>> +package org.apache.mahout.utils.vectors.io;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements. See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License. You may obtain a copy of the License at
>>> + *
>>> + * http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.ContentSummary;
>>> +import org.apache.hadoop.fs.FileSystem;
>>> +import org.apache.hadoop.fs.Path;
>>> +import org.apache.hadoop.io.SequenceFile;
>>> +import org.apache.hadoop.io.Writable;
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.math.VectorWritable;
>>> +
>>> +import java.io.IOException;
>>> +import java.util.Iterator;
>>> +
>>> +
>>> +/**
>>> + * Given a Sequence File containing vectors (actually, {@link org.apache.mahout.math.VectorWritable}, iterate over it.
>>> + *
>>> + **/
>>> +public class SequenceFileVectorIterable implements Iterable<Vector>{
>>> + protected SequenceFile.Reader reader;
>>> + protected long fileLen;
>>> + protected Writable keyWritable;
>>> + protected Writable valueWritable;
>>> + protected boolean useKey;
>>> +
>>> + /**
>>> + * Construct the Iterable
>>> + * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the {@link org.apache.hadoop.io.SequenceFile}
>>> + * @param file The {@link org.apache.hadoop.fs.Path} containing the file
>>> + * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
>>> + * @param useKey If true, use the key as the {@link org.apache.mahout.math.VectorWritable}, otherwise use the value
>>> + * @throws IllegalAccessException
>>> + * @throws InstantiationException
>>> + * @throws IOException
>>> + */
>>> + public SequenceFileVectorIterable(FileSystem fs, Path file, Configuration conf, boolean useKey) throws IllegalAccessException, InstantiationException, IOException {
>>> + this.reader = new SequenceFile.Reader(fs, file, conf);
>>> + ContentSummary summary = fs.getContentSummary(file);
>>> + fileLen = summary.getLength();
>>> + this.useKey = useKey;
>>> + keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>> + valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>>> + }
>>> +
>>> + /**
>>> + * The Iterator returned does not support remove()
>>> + * @return The {@link java.util.Iterator}
>>> + */
>>> + public Iterator<Vector> iterator() {
>>> + return new SFIterator();
>>> +
>>> + }
>>> +
>>> + private final class SFIterator implements Iterator<Vector>{
>>> + @Override
>>> + public boolean hasNext() {
>>> + //TODO: is this legitimate? We can't call next here since it breaks the iterator contract
>>> + try {
>>> + return reader.getPosition() < fileLen;
>>> + } catch (IOException e) {
>>> + return false;
>>> + }
>>> + }
>>> +
>>> + @Override
>>> + public Vector next() {
>>> + Vector result = null;
>>> + boolean valid = false;
>>> + try {
>>> + valid = reader.next(keyWritable, valueWritable);
>>> + if (valid){
>>> + result = ((VectorWritable) (useKey ? keyWritable : valueWritable)).get();
>>> + }
>>> + } catch (IOException e) {
>>> + throw new RuntimeException(e);
>>> + }
>>> +
>>> + return result;
>>> + }
>>> +
>>> + /**
>>> + * Not supported
>>> + */
>>> + public void remove() {
>>> + throw new UnsupportedOperationException();
>>> + }
>>> + }
>>> +}
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Fri Mar 25 14:28:12 2011
>>> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
>>> *
>>> * Closes the writer when done
>>> */
>>> -public class SequenceFileVectorWriter implements VectorWriter {
>>> +public class SequenceFileVectorWriter extends VectorWriter {
>>> private final SequenceFile.Writer writer;
>>> -
>>> + long recNum = 0;
>>> public SequenceFileVectorWriter(SequenceFile.Writer writer) {
>>> this.writer = writer;
>>> }
>>>
>>> @Override
>>> public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
>>> - long recNum = 0;
>>> +
>>> for (Vector point : iterable) {
>>> if (recNum >= maxDocs) {
>>> break;
>>> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
>>> }
>>> return recNum;
>>> }
>>> -
>>> +
>>> + @Override
>>> + public void write(Vector vector) throws IOException {
>>> + writer.append(new LongWritable(recNum++), new VectorWritable(vector));
>>> +
>>> + }
>>> +
>>> @Override
>>> public long write(Iterable<Vector> iterable) throws IOException {
>>> return write(iterable, Long.MAX_VALUE);
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Fri Mar 25 14:28:12 2011
>>> @@ -21,7 +21,7 @@ import java.io.IOException;
>>>
>>> import org.apache.mahout.math.Vector;
>>>
>>> -public interface VectorWriter {
>>> +public abstract class VectorWriter {
>>> /**
>>> * Write all values in the Iterable to the output
>>> * @param iterable The {@link Iterable} to loop over
>>> @@ -29,7 +29,15 @@ public interface VectorWriter {
>>> * @throws IOException if there was a problem writing
>>> *
>>> */
>>> - long write(Iterable<Vector> iterable) throws IOException;
>>> + public abstract long write(Iterable<Vector> iterable) throws IOException;
>>> +
>>> + /**
>>> + * Write out a vector
>>> + *
>>> + * @param vector The {@link org.apache.mahout.math.Vector} to write
>>> + * @throws IOException
>>> + */
>>> + public abstract void write(Vector vector) throws IOException;
>>>
>>> /**
>>> * Write the first <code>maxDocs</code> to the output.
>>> @@ -38,12 +46,12 @@ public interface VectorWriter {
>>> * @return The number of docs written
>>> * @throws IOException if there was a problem writing
>>> */
>>> - long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>>> + public abstract long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>>>
>>> /**
>>> * Close any internally held resources. If external Writers are passed in, the implementation should indicate
>>> * whether it also closes them
>>> * @throws IOException if there was an issue closing the item
>>> */
>>> - void close() throws IOException;
>>> + public abstract void close() throws IOException;
>>> }
>>>
>>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (added)
>>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,60 @@
>>> +package org.apache.mahout.utils.vectors.csv;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements. See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License. You may obtain a copy of the License at
>>> + *
>>> + * http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.utils.MahoutTestCase;
>>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>>> +import org.apache.mahout.utils.vectors.VectorHelper;
>>> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
>>> +import org.junit.Test;
>>> +
>>> +import java.io.IOException;
>>> +import java.io.StringReader;
>>> +import java.io.StringWriter;
>>> +
>>> +
>>> +/**
>>> + *
>>> + *
>>> + **/
>>> +public class CSVVectorIterableTest extends MahoutTestCase {
>>> +
>>> +
>>> + @Test
>>> + public void test() throws Exception {
>>> +
>>> + StringWriter sWriter = new StringWriter();
>>> + JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
>>> +
>>> + protected void formatVector(Vector vector) throws IOException {
>>> + String vecStr = VectorHelper.vectorToCSVString(vector, false);
>>> + writer.write(vecStr);
>>> + }
>>> + };
>>> + Iterable<Vector> iter = new RandomVectorIterable(50);
>>> + jwvw.write(iter);
>>> + jwvw.close();
>>> + CSVVectorIterable csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString()));
>>> + int count = 0;
>>> + for (Vector vector : csvIter) {
>>> + //System.out.println("Vec: " + vector);
>>> + count++;
>>> + }
>>> + assertEquals(50, count);
>>> + }
>>> +}
>>>
>>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java (added)
>>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,39 @@
>>> +package org.apache.mahout.utils.vectors.io;
>>> +
>>> +import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.FileSystem;
>>> +import org.apache.hadoop.fs.Path;
>>> +import org.apache.hadoop.io.LongWritable;
>>> +import org.apache.hadoop.io.SequenceFile;
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.math.VectorWritable;
>>> +import org.apache.mahout.utils.MahoutTestCase;
>>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>>> +import org.junit.Test;
>>> +
>>> +
>>> +/**
>>> + *
>>> + *
>>> + **/
>>> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
>>> +
>>> +
>>> + @Test
>>> + public void testSFVI() throws Exception {
>>> + Path path = getTestTempFilePath("sfvw");
>>> + Configuration conf = new Configuration();
>>> + FileSystem fs = FileSystem.get(conf);
>>> + SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
>>> + SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
>>> + Iterable<Vector> iter = new RandomVectorIterable(50);
>>> + writer.write(iter);
>>> + writer.close();
>>> + SequenceFileVectorIterable sfVIter = new SequenceFileVectorIterable(fs, path, conf, false);
>>> + int count = 0;
>>> + for (Vector vector : sfVIter) {
>>> + count++;
>>> + }
>>> + assertEquals(50, count);
>>> + }
>>> +}
>>>
>>>
>>>
>
> --------------------------
> Grant Ingersoll
> http://www.lucidimagination.com/
>
> Search the Lucene ecosystem docs using Solr/Lucene:
> http://www.lucidimagination.com/search
>
>
Re: svn commit: r1085397 - in /mahout/trunk/utils: ./ src/main/java/org/apache/mahout/utils/vectors/ src/main/java/org/apache/mahout/utils/vectors/csv/ src/main/java/org/apache/mahout/utils/vectors/io/ src/test/java/org/apache/mahout/utils/vectors/cs
Posted by Grant Ingersoll <gs...@apache.org>.
Ah, OK. Good to know. Hadn't followed that one. Feel free to change as appropriate or I can.
On Mar 25, 2011, at 9:12 PM, Dmitriy Lyubimov wrote:
> That would be a typical change I am trying to fix with 622:
> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397
>
>
>
> On Fri, Mar 25, 2011 at 7:28 AM, <gs...@apache.org> wrote:
>> Author: gsingers
>> Date: Fri Mar 25 14:28:12 2011
>> New Revision: 1085397
>>
>> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
>> Log:
>> MAHOUT-548: add in some CSV support for creating vectors, as well as a few other fixes for working with vectors
>>
>> Added:
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>> Modified:
>> mahout/trunk/utils/pom.xml
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>
>> Modified: mahout/trunk/utils/pom.xml
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/pom.xml (original)
>> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
>> @@ -142,6 +142,11 @@
>> <type>test-jar</type>
>> <scope>test</scope>
>> </dependency>
>> + <dependency>
>> + <groupId>org.apache.solr</groupId>
>> + <artifactId>solr-commons-csv</artifactId>
>> + <version>1.4.1</version>
>> + </dependency>
>>
>> <dependency>
>> <groupId>junit</groupId>
>>
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Fri Mar 25 14:28:12 2011
>> @@ -77,16 +77,22 @@ public final class VectorDumper {
>> Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
>> abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
>> "The dictionary file type (text|sequencefile)").withShortName("dt").create();
>> - Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>> - "Output the centroid as JSON. Otherwise it substitutes in the terms for vector cell entries")
>> + Option jsonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>> + "Output the Vector as JSON. Otherwise it substitutes in the terms for vector cell entries")
>> .withShortName("j").create();
>> + Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
>> + "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries")
>> + .withShortName("c").create();
>> + Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
>> + "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
>> + .withShortName("n").create();
>> Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
>> withDescription("Dump only the size of the vector").withShortName("sz").create();
>> Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
>> .create();
>>
>> Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
>> - dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
>> + dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
>> printKeyOpt).withOption(sizeOpt).create();
>>
>> try {
>> @@ -122,10 +128,12 @@ public final class VectorDumper {
>> throw new OptionException(dictTypeOpt);
>> }
>> }
>> - boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
>> + boolean useJSON = cmdLine.hasOption(jsonOpt);
>> + boolean useCSV = cmdLine.hasOption(csvOpt);
>> +
>> boolean sizeOnly = cmdLine.hasOption(sizeOpt);
>> SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
>> -
>> + boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
>> Writable keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>> Writable valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>> boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
>> @@ -140,6 +148,16 @@ public final class VectorDumper {
>> try {
>> boolean printKey = cmdLine.hasOption(printKeyOpt);
>> long i = 0;
>> + if (useCSV && dictionary != null){
>> + writer.write("#");
>> + for (int j = 0; j < dictionary.length; j++) {
>> + writer.write(dictionary[j]);
>> + if (j < dictionary.length - 1){
>> + writer.write(',');
>> + }
>> + }
>> + writer.write('\n');
>> + }
>> while (reader.next(keyWritable, valueWritable)) {
>> if (printKey) {
>> Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
>> @@ -159,7 +177,14 @@ public final class VectorDumper {
>> writer.write(String.valueOf(vector.size()));
>> writer.write('\n');
>> } else {
>> - String fmtStr = useJSON ? vector.asFormatString() : VectorHelper.vectorToString(vector, dictionary);
>> + String fmtStr;
>> + if (useJSON){
>> + fmtStr = VectorHelper.vectorToJSONString(vector, dictionary);
>> + } else if (useCSV){
>> + fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
>> + } else {
>> + fmtStr = vector.asFormatString();
>> + }
>> writer.write(fmtStr);
>> writer.write('\n');
>> }
>>
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Fri Mar 25 14:28:12 2011
>> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
>> public final class VectorHelper {
>>
>> private static final Pattern TAB_PATTERN = Pattern.compile("\t");
>> +
>>
>> private VectorHelper() { }
>> -
>> +
>> + public static String vectorToCSVString(Vector vector, boolean namesAsComments){
>> + StringBuilder bldr = new StringBuilder(2048);
>> + try {
>> + vectorToCSVString(vector, namesAsComments, bldr);
>> + } catch (IOException e) {
>> + throw new RuntimeException(e);
>> + }
>> + return bldr.toString();
>> + }
>> +
>> + public static void vectorToCSVString(Vector vector, boolean namesAsComments,
>> + Appendable bldr) throws IOException {
>> + if (namesAsComments && vector instanceof NamedVector){
>> + bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
>> + }
>> + Iterator<Vector.Element> iter = vector.iterator();
>> + boolean first = true;
>> + while (iter.hasNext()) {
>> + if (first) {
>> + first = false;
>> + } else {
>> + bldr.append(",");
>> + }
>> + Vector.Element elt = iter.next();
>> + bldr.append(String.valueOf(elt.get()));
>> + }
>> + bldr.append('\n');
>> + }
>> +
>> +
>> /**
>> * @return a String from a vector that fills in the values with the appropriate value from a dictionary where
>> * the ith entry is the term for the ith vector cell.
>> */
>> - public static String vectorToString(Vector vector, String[] dictionary) {
>> + public static String vectorToJSONString(Vector vector, String[] dictionary) {
>> StringBuilder bldr = new StringBuilder(2048);
>>
>> if (vector instanceof NamedVector) {
>> @@ -67,12 +98,13 @@ public final class VectorHelper {
>> if (dictionary != null) {
>> bldr.append(dictionary[elt.index()]);
>> } else {
>> - bldr.append(elt.index());
>> + bldr.append(String.valueOf(elt.index()));
>> }
>> - bldr.append(':').append(elt.get());
>> + bldr.append(':').append(String.valueOf(elt.get()));
>> }
>> return bldr.append('}').toString();
>> }
>> +
>>
>> /**
>> * Read in a dictionary file. Format is:
>>
>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java (added)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,94 @@
>> +package org.apache.mahout.utils.vectors.csv;
>> +/**
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements. See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License. You may obtain a copy of the License at
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +import org.apache.commons.csv.CSVParser;
>> +import org.apache.commons.csv.CSVStrategy;
>> +import org.apache.mahout.math.DenseVector;
>> +import org.apache.mahout.math.Vector;
>> +
>> +import java.io.BufferedReader;
>> +import java.io.IOException;
>> +import java.io.Reader;
>> +import java.util.Iterator;
>> +
>> +
>> +/**
>> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
>> + * <br/>
>> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
>> + * <p/>
>> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
>> + * <p/>
>> + * The Iterator is not thread-safe.
>> + *
>> + *
>> + **/
>> +public class CSVVectorIterable implements Iterable<Vector> {
>> + protected CSVParser parser;
>> + protected String [] line;
>> +
>> + public CSVVectorIterable(Reader reader) throws IOException {
>> + parser = new CSVParser(reader);
>> + line = parser.getLine();
>> + }
>> +
>> + public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws IOException {
>> + parser = new CSVParser(reader, strategy);
>> + line = parser.getLine();
>> + }
>> +
>> +
>> + @Override
>> + public Iterator<Vector> iterator() {
>> + return new CSVIterator();
>> + }
>> +
>> + private class CSVIterator implements Iterator<Vector>{
>> +
>> +
>> + public CSVIterator() {
>> + }
>> +
>> + @Override
>> + public boolean hasNext() {
>> + return line != null;
>> + }
>> +
>> + @Override
>> + public Vector next() {
>> +
>> + Vector result = null;
>> + result = new DenseVector(line.length);
>> + for (int i = 0; i < line.length; i++) {
>> + result.setQuick(i, Double.parseDouble(line[i]));
>> + }
>> + //move the line forward
>> + try {
>> + line = parser.getLine();
>> + } catch (IOException e) {
>> + throw new RuntimeException(e);
>> + }
>> + return result;
>> + }
>> +
>> + @Override
>> + public void remove() {
>> + throw new UnsupportedOperationException();
>> + }
>> + }
>> +}
>>
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Fri Mar 25 14:28:12 2011
>> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
>> /**
>> * Write out the vectors to any {@link java.io.Writer} using {@link org.apache.mahout.math.Vector#asFormatString()}.
>> */
>> -public class JWriterVectorWriter implements VectorWriter {
>> - private final Writer writer;
>> +public class JWriterVectorWriter extends VectorWriter {
>> + protected final Writer writer;
>>
>> public JWriterVectorWriter(Writer writer) {
>> this.writer = writer;
>> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
>> if (result >= maxDocs) {
>> break;
>> }
>> - writer.write(vector.asFormatString());
>> - writer.write('\n');
>> -
>> + formatVector(vector);
>> result++;
>> }
>> return result;
>> }
>> -
>> +
>> + protected void formatVector(Vector vector) throws IOException {
>> + writer.write(vector.asFormatString());
>> + writer.write('\n');
>> + }
>> +
>> + @Override
>> + public void write(Vector vector) throws IOException {
>> + formatVector(vector);
>> + }
>> +
>> @Override
>> public void close() throws IOException {
>> writer.flush();
>>
>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java (added)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,105 @@
>> +package org.apache.mahout.utils.vectors.io;
>> +/**
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements. See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License. You may obtain a copy of the License at
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.fs.ContentSummary;
>> +import org.apache.hadoop.fs.FileSystem;
>> +import org.apache.hadoop.fs.Path;
>> +import org.apache.hadoop.io.SequenceFile;
>> +import org.apache.hadoop.io.Writable;
>> +import org.apache.mahout.math.Vector;
>> +import org.apache.mahout.math.VectorWritable;
>> +
>> +import java.io.IOException;
>> +import java.util.Iterator;
>> +
>> +
>> +/**
>> + * Given a Sequence File containing vectors (actually, {@link org.apache.mahout.math.VectorWritable}), iterate over it.
>> + *
>> + **/
>> +public class SequenceFileVectorIterable implements Iterable<Vector>{
>> + protected SequenceFile.Reader reader;
>> + protected long fileLen;
>> + protected Writable keyWritable;
>> + protected Writable valueWritable;
>> + protected boolean useKey;
>> +
>> + /**
>> + * Construct the Iterable
>> + * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the {@link org.apache.hadoop.io.SequenceFile}
>> + * @param file The {@link org.apache.hadoop.fs.Path} containing the file
>> + * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
>> + * @param useKey If true, use the key as the {@link org.apache.mahout.math.VectorWritable}, otherwise use the value
>> + * @throws IllegalAccessException
>> + * @throws InstantiationException
>> + * @throws IOException
>> + */
>> + public SequenceFileVectorIterable(FileSystem fs, Path file, Configuration conf, boolean useKey) throws IllegalAccessException, InstantiationException, IOException {
>> + this.reader = new SequenceFile.Reader(fs, file, conf);
>> + ContentSummary summary = fs.getContentSummary(file);
>> + fileLen = summary.getLength();
>> + this.useKey = useKey;
>> + keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>> + valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>> + }
>> +
>> + /**
>> + * The Iterator returned does not support remove()
>> + * @return The {@link java.util.Iterator}
>> + */
>> + public Iterator<Vector> iterator() {
>> + return new SFIterator();
>> +
>> + }
>> +
>> + private final class SFIterator implements Iterator<Vector>{
>> + @Override
>> + public boolean hasNext() {
>> + //TODO: is this legitimate? We can't call next here since it breaks the iterator contract
>> + try {
>> + return reader.getPosition() < fileLen;
>> + } catch (IOException e) {
>> + return false;
>> + }
>> + }
>> +
>> + @Override
>> + public Vector next() {
>> + Vector result = null;
>> + boolean valid = false;
>> + try {
>> + valid = reader.next(keyWritable, valueWritable);
>> + if (valid){
>> + result = ((VectorWritable) (useKey ? keyWritable : valueWritable)).get();
>> + }
>> + } catch (IOException e) {
>> + throw new RuntimeException(e);
>> + }
>> +
>> + return result;
>> + }
>> +
>> + /**
>> + * Not supported
>> + */
>> + public void remove() {
>> + throw new UnsupportedOperationException();
>> + }
>> + }
>> +}
>>
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Fri Mar 25 14:28:12 2011
>> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
>> *
>> * Closes the writer when done
>> */
>> -public class SequenceFileVectorWriter implements VectorWriter {
>> +public class SequenceFileVectorWriter extends VectorWriter {
>> private final SequenceFile.Writer writer;
>> -
>> + long recNum = 0;
>> public SequenceFileVectorWriter(SequenceFile.Writer writer) {
>> this.writer = writer;
>> }
>>
>> @Override
>> public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
>> - long recNum = 0;
>> +
>> for (Vector point : iterable) {
>> if (recNum >= maxDocs) {
>> break;
>> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
>> }
>> return recNum;
>> }
>> -
>> +
>> + @Override
>> + public void write(Vector vector) throws IOException {
>> + writer.append(new LongWritable(recNum++), new VectorWritable(vector));
>> +
>> + }
>> +
>> @Override
>> public long write(Iterable<Vector> iterable) throws IOException {
>> return write(iterable, Long.MAX_VALUE);
>>
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Fri Mar 25 14:28:12 2011
>> @@ -21,7 +21,7 @@ import java.io.IOException;
>>
>> import org.apache.mahout.math.Vector;
>>
>> -public interface VectorWriter {
>> +public abstract class VectorWriter {
>> /**
>> * Write all values in the Iterable to the output
>> * @param iterable The {@link Iterable} to loop over
>> @@ -29,7 +29,15 @@ public interface VectorWriter {
>> * @throws IOException if there was a problem writing
>> *
>> */
>> - long write(Iterable<Vector> iterable) throws IOException;
>> + public abstract long write(Iterable<Vector> iterable) throws IOException;
>> +
>> + /**
>> + * Write out a vector
>> + *
>> + * @param vector The {@link org.apache.mahout.math.Vector} to write
>> + * @throws IOException
>> + */
>> + public abstract void write(Vector vector) throws IOException;
>>
>> /**
>> * Write the first <code>maxDocs</code> to the output.
>> @@ -38,12 +46,12 @@ public interface VectorWriter {
>> * @return The number of docs written
>> * @throws IOException if there was a problem writing
>> */
>> - long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>> + public abstract long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>>
>> /**
>> * Close any internally held resources. If external Writers are passed in, the implementation should indicate
>> * whether it also closes them
>> * @throws IOException if there was an issue closing the item
>> */
>> - void close() throws IOException;
>> + public abstract void close() throws IOException;
>> }
>>
>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (added)
>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,60 @@
>> +package org.apache.mahout.utils.vectors.csv;
>> +/**
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements. See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License. You may obtain a copy of the License at
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +import org.apache.mahout.math.Vector;
>> +import org.apache.mahout.utils.MahoutTestCase;
>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>> +import org.apache.mahout.utils.vectors.VectorHelper;
>> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
>> +import org.junit.Test;
>> +
>> +import java.io.IOException;
>> +import java.io.StringReader;
>> +import java.io.StringWriter;
>> +
>> +
>> +/**
>> + *
>> + *
>> + **/
>> +public class CSVVectorIterableTest extends MahoutTestCase {
>> +
>> +
>> + @Test
>> + public void test() throws Exception {
>> +
>> + StringWriter sWriter = new StringWriter();
>> + JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
>> +
>> + protected void formatVector(Vector vector) throws IOException {
>> + String vecStr = VectorHelper.vectorToCSVString(vector, false);
>> + writer.write(vecStr);
>> + }
>> + };
>> + Iterable<Vector> iter = new RandomVectorIterable(50);
>> + jwvw.write(iter);
>> + jwvw.close();
>> + CSVVectorIterable csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString()));
>> + int count = 0;
>> + for (Vector vector : csvIter) {
>> + //System.out.println("Vec: " + vector);
>> + count++;
>> + }
>> + assertEquals(50, count);
>> + }
>> +}
>>
>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java (added)
>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,39 @@
>> +package org.apache.mahout.utils.vectors.io;
>> +
>> +import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.fs.FileSystem;
>> +import org.apache.hadoop.fs.Path;
>> +import org.apache.hadoop.io.LongWritable;
>> +import org.apache.hadoop.io.SequenceFile;
>> +import org.apache.mahout.math.Vector;
>> +import org.apache.mahout.math.VectorWritable;
>> +import org.apache.mahout.utils.MahoutTestCase;
>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>> +import org.junit.Test;
>> +
>> +
>> +/**
>> + *
>> + *
>> + **/
>> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
>> +
>> +
>> + @Test
>> + public void testSFVI() throws Exception {
>> + Path path = getTestTempFilePath("sfvw");
>> + Configuration conf = new Configuration();
>> + FileSystem fs = FileSystem.get(conf);
>> + SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
>> + SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
>> + Iterable<Vector> iter = new RandomVectorIterable(50);
>> + writer.write(iter);
>> + writer.close();
>> + SequenceFileVectorIterable sfVIter = new SequenceFileVectorIterable(fs, path, conf, false);
>> + int count = 0;
>> + for (Vector vector : sfVIter) {
>> + count++;
>> + }
>> + assertEquals(50, count);
>> + }
>> +}
>>
>>
>>
--------------------------
Grant Ingersoll
http://www.lucidimagination.com/
Search the Lucene ecosystem docs using Solr/Lucene:
http://www.lucidimagination.com/search