You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@mahout.apache.org by Dmitriy Lyubimov <dl...@gmail.com> on 2011/03/26 02:12:10 UTC

Re: svn commit: r1085397 - in /mahout/trunk/utils: ./ src/main/java/org/apache/mahout/utils/vectors/ src/main/java/org/apache/mahout/utils/vectors/csv/ src/main/java/org/apache/mahout/utils/vectors/io/ src/test/java/org/apache/mahout/utils/vectors/cs

That would be a typical change I am trying to fix with 622:
http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397



On Fri, Mar 25, 2011 at 7:28 AM,  <gs...@apache.org> wrote:
> Author: gsingers
> Date: Fri Mar 25 14:28:12 2011
> New Revision: 1085397
>
> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
> Log:
> MAHOUT-548: add in some CSV support for creating vectors, as well as a few other fixes for working with vectors
>
> Added:
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
> Modified:
>    mahout/trunk/utils/pom.xml
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>
> Modified: mahout/trunk/utils/pom.xml
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/pom.xml (original)
> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
> @@ -142,6 +142,11 @@
>       <type>test-jar</type>
>       <scope>test</scope>
>     </dependency>
> +    <dependency>
> +      <groupId>org.apache.solr</groupId>
> +      <artifactId>solr-commons-csv</artifactId>
> +      <version>1.4.1</version>
> +    </dependency>
>
>     <dependency>
>       <groupId>junit</groupId>
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Fri Mar 25 14:28:12 2011
> @@ -77,16 +77,22 @@ public final class VectorDumper {
>     Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
>             abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
>             "The dictionary file type (text|sequencefile)").withShortName("dt").create();
> -    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
> -            "Output the centroid as JSON.  Otherwise it substitutes in the terms for vector cell entries")
> +    Option jsonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
> +            "Output the Vector as JSON.  Otherwise it substitutes in the terms for vector cell entries")
>             .withShortName("j").create();
> +    Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
> +            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries")
> +            .withShortName("c").create();
> +    Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
> +            "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
> +            .withShortName("n").create();
>     Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
>             withDescription("Dump only the size of the vector").withShortName("sz").create();
>     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
>             .create();
>
>     Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
> -            dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
> +            dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
>             printKeyOpt).withOption(sizeOpt).create();
>
>     try {
> @@ -122,10 +128,12 @@ public final class VectorDumper {
>             throw new OptionException(dictTypeOpt);
>           }
>         }
> -        boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
> +        boolean useJSON = cmdLine.hasOption(jsonOpt);
> +        boolean useCSV = cmdLine.hasOption(csvOpt);
> +
>         boolean sizeOnly = cmdLine.hasOption(sizeOpt);
>         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
> -
> +        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
>         Writable keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>         Writable valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>         boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
> @@ -140,6 +148,16 @@ public final class VectorDumper {
>           try {
>             boolean printKey = cmdLine.hasOption(printKeyOpt);
>             long i = 0;
> +            if (useCSV && dictionary != null){
> +              writer.write("#");
> +              for (int j = 0; j < dictionary.length; j++) {
> +                writer.write(dictionary[j]);
> +                if (j < dictionary.length - 1){
> +                  writer.write(',');
> +                }
> +              }
> +              writer.write('\n');
> +            }
>             while (reader.next(keyWritable, valueWritable)) {
>               if (printKey) {
>                 Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
> @@ -159,7 +177,14 @@ public final class VectorDumper {
>                 writer.write(String.valueOf(vector.size()));
>                 writer.write('\n');
>               } else {
> -                String fmtStr = useJSON ? vector.asFormatString() : VectorHelper.vectorToString(vector, dictionary);
> +                String fmtStr;
> +                if (useJSON){
> +                  fmtStr = VectorHelper.vectorToJSONString(vector, dictionary);
> +                } else if (useCSV){
> +                  fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
> +                } else {
> +                  fmtStr = vector.asFormatString();
> +                }
>                 writer.write(fmtStr);
>                 writer.write('\n');
>               }
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Fri Mar 25 14:28:12 2011
> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
>  public final class VectorHelper {
>
>   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
> +
>
>   private VectorHelper() { }
> -
> +
> +  public static String vectorToCSVString(Vector vector, boolean namesAsComments){
> +    StringBuilder bldr = new StringBuilder(2048);
> +    try {
> +      vectorToCSVString(vector, namesAsComments, bldr);
> +    } catch (IOException e) {
> +      throw new RuntimeException(e);
> +    }
> +    return bldr.toString();
> +  }
> +
> +  public static void vectorToCSVString(Vector vector, boolean namesAsComments,
> +                                       Appendable bldr) throws IOException {
> +    if (namesAsComments && vector instanceof NamedVector){
> +      bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
> +    }
> +    Iterator<Vector.Element> iter = vector.iterator();
> +    boolean first = true;
> +    while (iter.hasNext()) {
> +      if (first) {
> +        first = false;
> +      } else {
> +        bldr.append(",");
> +      }
> +      Vector.Element elt = iter.next();
> +      bldr.append(String.valueOf(elt.get()));
> +    }
> +    bldr.append('\n');
> +  }
> +
> +
>   /**
>    * @return a String from a vector that fills in the values with the appropriate value from a dictionary where
>    * each the ith entry is the term for the ith vector cell.
>    */
> -  public static String vectorToString(Vector vector, String[] dictionary) {
> +  public static String vectorToJSONString(Vector vector, String[] dictionary) {
>     StringBuilder bldr = new StringBuilder(2048);
>
>     if (vector instanceof NamedVector) {
> @@ -67,12 +98,13 @@ public final class VectorHelper {
>       if (dictionary != null) {
>         bldr.append(dictionary[elt.index()]);
>       } else {
> -        bldr.append(elt.index());
> +        bldr.append(String.valueOf(elt.index()));
>       }
> -      bldr.append(':').append(elt.get());
> +      bldr.append(':').append(String.valueOf(elt.get()));
>     }
>     return bldr.append('}').toString();
>   }
> +
>
>   /**
>    * Read in a dictionary file. Format is:
>
> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java (added)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,94 @@
> +package org.apache.mahout.utils.vectors.csv;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.commons.csv.CSVParser;
> +import org.apache.commons.csv.CSVStrategy;
> +import org.apache.mahout.math.DenseVector;
> +import org.apache.mahout.math.Vector;
> +
> +import java.io.BufferedReader;
> +import java.io.IOException;
> +import java.io.Reader;
> +import java.util.Iterator;
> +
> +
> +/**
> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
> + * <br/>
> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
> + * <p/>
> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
> + * <p/>
> + * The Iterator is not thread-safe.
> + *
> + *
> + **/
> +public class CSVVectorIterable implements Iterable<Vector> {
> +  protected CSVParser parser;
> +  protected String [] line;
> +
> +  public CSVVectorIterable(Reader reader) throws IOException {
> +    parser = new CSVParser(reader);
> +    line = parser.getLine();
> +  }
> +
> +  public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws IOException {
> +    parser = new CSVParser(reader, strategy);
> +    line = parser.getLine();
> +  }
> +
> +
> +  @Override
> +  public Iterator<Vector> iterator() {
> +    return new CSVIterator();
> +  }
> +
> +  private class CSVIterator implements Iterator<Vector>{
> +
> +
> +    public CSVIterator() {
> +    }
> +
> +    @Override
> +    public boolean hasNext() {
> +      return line != null;
> +    }
> +
> +    @Override
> +    public Vector next() {
> +
> +      Vector result = null;
> +      result = new DenseVector(line.length);
> +      for (int i = 0; i < line.length; i++) {
> +        result.setQuick(i, Double.parseDouble(line[i]));
> +      }
> +      //move the line forward
> +      try {
> +        line = parser.getLine();
> +      } catch (IOException e) {
> +        throw new RuntimeException(e);
> +      }
> +      return result;
> +    }
> +
> +    @Override
> +    public void remove() {
> +      throw new UnsupportedOperationException();
> +    }
> +  }
> +}
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Fri Mar 25 14:28:12 2011
> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
>  /**
>  * Write out the vectors to any {@link java.io.Writer} using {@link org.apache.mahout.math.Vector#asFormatString()}.
>  */
> -public class JWriterVectorWriter implements VectorWriter {
> -  private final Writer writer;
> +public class JWriterVectorWriter extends VectorWriter {
> +  protected final Writer writer;
>
>   public JWriterVectorWriter(Writer writer) {
>     this.writer = writer;
> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
>       if (result >= maxDocs) {
>         break;
>       }
> -      writer.write(vector.asFormatString());
> -      writer.write('\n');
> -
> +      formatVector(vector);
>       result++;
>     }
>     return result;
>   }
> -
> +
> +  protected void formatVector(Vector vector) throws IOException {
> +    writer.write(vector.asFormatString());
> +    writer.write('\n');
> +  }
> +
> +  @Override
> +  public void write(Vector vector) throws IOException {
> +    formatVector(vector);
> +  }
> +
>   @Override
>   public void close() throws IOException {
>     writer.flush();
>
> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java (added)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,105 @@
> +package org.apache.mahout.utils.vectors.io;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.fs.ContentSummary;
> +import org.apache.hadoop.fs.FileSystem;
> +import org.apache.hadoop.fs.Path;
> +import org.apache.hadoop.io.SequenceFile;
> +import org.apache.hadoop.io.Writable;
> +import org.apache.mahout.math.Vector;
> +import org.apache.mahout.math.VectorWritable;
> +
> +import java.io.IOException;
> +import java.util.Iterator;
> +
> +
> +/**
> + * Given a Sequence File containing vectors (actually, {@link org.apache.mahout.math.VectorWritable}, iterate over it.
> + *
> + **/
> +public class SequenceFileVectorIterable implements Iterable<Vector>{
> +  protected SequenceFile.Reader reader;
> +  protected long fileLen;
> +  protected Writable keyWritable;
> +  protected Writable valueWritable;
> +  protected boolean useKey;
> +
> +  /**
> +   * Construct the Iterable
> +   * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the {@link org.apache.hadoop.io.SequenceFile}
> +   * @param file The {@link org.apache.hadoop.fs.Path} containing the file
> +   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
> +   * @param useKey If true, use the key as the {@link org.apache.mahout.math.VectorWritable}, otherwise use the value
> +   * @throws IllegalAccessException
> +   * @throws InstantiationException
> +   * @throws IOException
> +   */
> +  public SequenceFileVectorIterable(FileSystem fs, Path file, Configuration conf, boolean useKey) throws IllegalAccessException, InstantiationException, IOException {
> +    this.reader = new SequenceFile.Reader(fs, file, conf);
> +    ContentSummary summary = fs.getContentSummary(file);
> +    fileLen = summary.getLength();
> +    this.useKey = useKey;
> +    keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
> +    valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
> +  }
> +
> +  /**
> +   * The Iterator returned does not support remove()
> +   * @return The {@link java.util.Iterator}
> +   */
> +  public Iterator<Vector> iterator() {
> +    return new SFIterator();
> +
> +  }
> +
> +  private final class SFIterator implements Iterator<Vector>{
> +    @Override
> +    public boolean hasNext() {
> +      //TODO: is this legitimate?  We can't call next here since it breaks the iterator contract
> +      try {
> +        return reader.getPosition() < fileLen;
> +      } catch (IOException e) {
> +        return false;
> +      }
> +    }
> +
> +    @Override
> +    public Vector next() {
> +      Vector result = null;
> +      boolean valid = false;
> +      try {
> +        valid = reader.next(keyWritable, valueWritable);
> +        if (valid){
> +          result = ((VectorWritable) (useKey ? keyWritable : valueWritable)).get();
> +        }
> +      } catch (IOException e) {
> +        throw new RuntimeException(e);
> +      }
> +
> +      return result;
> +    }
> +
> +    /**
> +     * Not supported
> +     */
> +    public void remove() {
> +      throw new UnsupportedOperationException();
> +    }
> +  }
> +}
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Fri Mar 25 14:28:12 2011
> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
>  *
>  * Closes the writer when done
>  */
> -public class SequenceFileVectorWriter implements VectorWriter {
> +public class SequenceFileVectorWriter extends VectorWriter {
>   private final SequenceFile.Writer writer;
> -
> +  long recNum = 0;
>   public SequenceFileVectorWriter(SequenceFile.Writer writer) {
>     this.writer = writer;
>   }
>
>   @Override
>   public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
> -    long recNum = 0;
> +
>     for (Vector point : iterable) {
>       if (recNum >= maxDocs) {
>         break;
> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
>     }
>     return recNum;
>   }
> -
> +
> +  @Override
> +  public void write(Vector vector) throws IOException {
> +    writer.append(new LongWritable(recNum++), new VectorWritable(vector));
> +
> +  }
> +
>   @Override
>   public long write(Iterable<Vector> iterable) throws IOException {
>     return write(iterable, Long.MAX_VALUE);
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Fri Mar 25 14:28:12 2011
> @@ -21,7 +21,7 @@ import java.io.IOException;
>
>  import org.apache.mahout.math.Vector;
>
> -public interface VectorWriter {
> +public abstract class VectorWriter {
>   /**
>    * Write all values in the Iterable to the output
>    * @param iterable The {@link Iterable} to loop over
> @@ -29,7 +29,15 @@ public interface VectorWriter {
>    * @throws IOException if there was a problem writing
>    *
>    */
> -  long write(Iterable<Vector> iterable) throws IOException;
> +  public abstract long write(Iterable<Vector> iterable) throws IOException;
> +
> +  /**
> +   * Write out a vector
> +   *
> +   * @param vector The {@link org.apache.mahout.math.Vector} to write
> +   * @throws IOException
> +   */
> +  public abstract void write(Vector vector) throws IOException;
>
>   /**
>    * Write the first <code>maxDocs</code> to the output.
> @@ -38,12 +46,12 @@ public interface VectorWriter {
>    * @return The number of docs written
>    * @throws IOException if there was a problem writing
>    */
> -  long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
> +  public abstract long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>
>   /**
>    * Close any internally held resources.  If external Writers are passed in, the implementation should indicate
>    * whether it also closes them
>    * @throws IOException if there was an issue closing the item
>    */
> -  void close() throws IOException;
> +  public abstract void close() throws IOException;
>  }
>
> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (added)
> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,60 @@
> +package org.apache.mahout.utils.vectors.csv;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +import org.apache.mahout.math.Vector;
> +import org.apache.mahout.utils.MahoutTestCase;
> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
> +import org.apache.mahout.utils.vectors.VectorHelper;
> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
> +import org.junit.Test;
> +
> +import java.io.IOException;
> +import java.io.StringReader;
> +import java.io.StringWriter;
> +
> +
> +/**
> + *
> + *
> + **/
> +public class CSVVectorIterableTest extends MahoutTestCase {
> +
> +
> +  @Test
> +  public void test() throws Exception {
> +
> +    StringWriter sWriter = new StringWriter();
> +    JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
> +
> +      protected void formatVector(Vector vector) throws IOException {
> +        String vecStr = VectorHelper.vectorToCSVString(vector, false);
> +        writer.write(vecStr);
> +      }
> +    };
> +    Iterable<Vector> iter = new RandomVectorIterable(50);
> +    jwvw.write(iter);
> +    jwvw.close();
> +    CSVVectorIterable csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString()));
> +    int count = 0;
> +    for (Vector vector : csvIter) {
> +      //System.out.println("Vec: " + vector);
> +      count++;
> +    }
> +    assertEquals(50, count);
> +  }
> +}
>
> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
> ==============================================================================
> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java (added)
> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java Fri Mar 25 14:28:12 2011
> @@ -0,0 +1,39 @@
> +package org.apache.mahout.utils.vectors.io;
> +
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.hadoop.fs.FileSystem;
> +import org.apache.hadoop.fs.Path;
> +import org.apache.hadoop.io.LongWritable;
> +import org.apache.hadoop.io.SequenceFile;
> +import org.apache.mahout.math.Vector;
> +import org.apache.mahout.math.VectorWritable;
> +import org.apache.mahout.utils.MahoutTestCase;
> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
> +import org.junit.Test;
> +
> +
> +/**
> + *
> + *
> + **/
> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
> +
> +
> +  @Test
> +  public void testSFVI() throws Exception {
> +    Path path = getTestTempFilePath("sfvw");
> +    Configuration conf = new Configuration();
> +    FileSystem fs = FileSystem.get(conf);
> +    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
> +    SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
> +    Iterable<Vector> iter = new RandomVectorIterable(50);
> +    writer.write(iter);
> +    writer.close();
> +    SequenceFileVectorIterable sfVIter = new SequenceFileVectorIterable(fs, path, conf, false);
> +    int count = 0;
> +    for (Vector vector : sfVIter) {
> +      count++;
> +    }
> +    assertEquals(50, count);
> +  }
> +}
>
>
>

Re: svn commit: r1085397 - in /mahout/trunk/utils: ./ src/main/java/org/apache/mahout/utils/vectors/ src/main/java/org/apache/mahout/utils/vectors/csv/ src/main/java/org/apache/mahout/utils/vectors/io/ src/test/java/org/apache/mahout/utils/vectors/cs

Posted by Dmitriy Lyubimov <dl...@gmail.com>.
No problem. It was too late to comment :) I will tackle it in one of the 622 patches :)

On Sat, Mar 26, 2011 at 4:03 AM, Grant Ingersoll <gs...@apache.org> wrote:
> Ah, OK.  Good to know.  Hadn't followed that one.   Feel free to change as appropriate or I can.
>
> On Mar 25, 2011, at 9:12 PM, Dmitriy Lyubimov wrote:
>
>> That would be a typical change I am trying to fix with 622:
>> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397
>>
>>
>>
>> On Fri, Mar 25, 2011 at 7:28 AM,  <gs...@apache.org> wrote:
>>> Author: gsingers
>>> Date: Fri Mar 25 14:28:12 2011
>>> New Revision: 1085397
>>>
>>> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
>>> Log:
>>> MAHOUT-548: add in some CSV support for creating vectors, as well as a few other fixes for working with vectors
>>>
>>> Added:
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
>>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>> Modified:
>>>    mahout/trunk/utils/pom.xml
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>>
>>> Modified: mahout/trunk/utils/pom.xml
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/pom.xml (original)
>>> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
>>> @@ -142,6 +142,11 @@
>>>       <type>test-jar</type>
>>>       <scope>test</scope>
>>>     </dependency>
>>> +    <dependency>
>>> +      <groupId>org.apache.solr</groupId>
>>> +      <artifactId>solr-commons-csv</artifactId>
>>> +      <version>1.4.1</version>
>>> +    </dependency>
>>>
>>>     <dependency>
>>>       <groupId>junit</groupId>
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Fri Mar 25 14:28:12 2011
>>> @@ -77,16 +77,22 @@ public final class VectorDumper {
>>>     Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
>>>             abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
>>>             "The dictionary file type (text|sequencefile)").withShortName("dt").create();
>>> -    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>>> -            "Output the centroid as JSON.  Otherwise it substitutes in the terms for vector cell entries")
>>> +    Option jsonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>>> +            "Output the Vector as JSON.  Otherwise it substitutes in the terms for vector cell entries")
>>>             .withShortName("j").create();
>>> +    Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
>>> +            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries")
>>> +            .withShortName("c").create();
>>> +    Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
>>> +            "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
>>> +            .withShortName("n").create();
>>>     Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
>>>             withDescription("Dump only the size of the vector").withShortName("sz").create();
>>>     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
>>>             .create();
>>>
>>>     Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
>>> -            dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
>>> +            dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
>>>             printKeyOpt).withOption(sizeOpt).create();
>>>
>>>     try {
>>> @@ -122,10 +128,12 @@ public final class VectorDumper {
>>>             throw new OptionException(dictTypeOpt);
>>>           }
>>>         }
>>> -        boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
>>> +        boolean useJSON = cmdLine.hasOption(jsonOpt);
>>> +        boolean useCSV = cmdLine.hasOption(csvOpt);
>>> +
>>>         boolean sizeOnly = cmdLine.hasOption(sizeOpt);
>>>         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
>>> -
>>> +        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
>>>         Writable keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>>         Writable valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>>>         boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
>>> @@ -140,6 +148,16 @@ public final class VectorDumper {
>>>           try {
>>>             boolean printKey = cmdLine.hasOption(printKeyOpt);
>>>             long i = 0;
>>> +            if (useCSV && dictionary != null){
>>> +              writer.write("#");
>>> +              for (int j = 0; j < dictionary.length; j++) {
>>> +                writer.write(dictionary[j]);
>>> +                if (j < dictionary.length - 1){
>>> +                  writer.write(',');
>>> +                }
>>> +              }
>>> +              writer.write('\n');
>>> +            }
>>>             while (reader.next(keyWritable, valueWritable)) {
>>>               if (printKey) {
>>>                 Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
>>> @@ -159,7 +177,14 @@ public final class VectorDumper {
>>>                 writer.write(String.valueOf(vector.size()));
>>>                 writer.write('\n');
>>>               } else {
>>> -                String fmtStr = useJSON ? vector.asFormatString() : VectorHelper.vectorToString(vector, dictionary);
>>> +                String fmtStr;
>>> +                if (useJSON){
>>> +                  fmtStr = VectorHelper.vectorToJSONString(vector, dictionary);
>>> +                } else if (useCSV){
>>> +                  fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
>>> +                } else {
>>> +                  fmtStr = vector.asFormatString();
>>> +                }
>>>                 writer.write(fmtStr);
>>>                 writer.write('\n');
>>>               }
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Fri Mar 25 14:28:12 2011
>>> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
>>>  public final class VectorHelper {
>>>
>>>   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
>>> +
>>>
>>>   private VectorHelper() { }
>>> -
>>> +
>>> +  public static String vectorToCSVString(Vector vector, boolean namesAsComments){
>>> +    StringBuilder bldr = new StringBuilder(2048);
>>> +    try {
>>> +      vectorToCSVString(vector, namesAsComments, bldr);
>>> +    } catch (IOException e) {
>>> +      throw new RuntimeException(e);
>>> +    }
>>> +    return bldr.toString();
>>> +  }
>>> +
>>> +  public static void vectorToCSVString(Vector vector, boolean namesAsComments,
>>> +                                       Appendable bldr) throws IOException {
>>> +    if (namesAsComments && vector instanceof NamedVector){
>>> +      bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
>>> +    }
>>> +    Iterator<Vector.Element> iter = vector.iterator();
>>> +    boolean first = true;
>>> +    while (iter.hasNext()) {
>>> +      if (first) {
>>> +        first = false;
>>> +      } else {
>>> +        bldr.append(",");
>>> +      }
>>> +      Vector.Element elt = iter.next();
>>> +      bldr.append(String.valueOf(elt.get()));
>>> +    }
>>> +    bldr.append('\n');
>>> +  }
>>> +
>>> +
>>>   /**
>>>    * @return a String from a vector that fills in the values with the appropriate value from a dictionary where
>>>    * each the ith entry is the term for the ith vector cell.
>>>    */
>>> -  public static String vectorToString(Vector vector, String[] dictionary) {
>>> +  public static String vectorToJSONString(Vector vector, String[] dictionary) {
>>>     StringBuilder bldr = new StringBuilder(2048);
>>>
>>>     if (vector instanceof NamedVector) {
>>> @@ -67,12 +98,13 @@ public final class VectorHelper {
>>>       if (dictionary != null) {
>>>         bldr.append(dictionary[elt.index()]);
>>>       } else {
>>> -        bldr.append(elt.index());
>>> +        bldr.append(String.valueOf(elt.index()));
>>>       }
>>> -      bldr.append(':').append(elt.get());
>>> +      bldr.append(':').append(String.valueOf(elt.get()));
>>>     }
>>>     return bldr.append('}').toString();
>>>   }
>>> +
>>>
>>>   /**
>>>    * Read in a dictionary file. Format is:
>>>
>>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java (added)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,94 @@
>>> +package org.apache.mahout.utils.vectors.csv;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements.  See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License.  You may obtain a copy of the License at
>>> + *
>>> + *     http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.commons.csv.CSVParser;
>>> +import org.apache.commons.csv.CSVStrategy;
>>> +import org.apache.mahout.math.DenseVector;
>>> +import org.apache.mahout.math.Vector;
>>> +
>>> +import java.io.BufferedReader;
>>> +import java.io.IOException;
>>> +import java.io.Reader;
>>> +import java.util.Iterator;
>>> +
>>> +
>>> +/**
>>> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
>>> + * <br/>
>>> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
>>> + * <p/>
>>> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
>>> + * <p/>
>>> + * The Iterator is not thread-safe.
>>> + *
>>> + *
>>> + **/
>>> +public class CSVVectorIterable implements Iterable<Vector> {
>>> +  protected CSVParser parser;
>>> +  protected String [] line;
>>> +
>>> +  public CSVVectorIterable(Reader reader) throws IOException {
>>> +    parser = new CSVParser(reader);
>>> +    line = parser.getLine();
>>> +  }
>>> +
>>> +  public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws IOException {
>>> +    parser = new CSVParser(reader, strategy);
>>> +    line = parser.getLine();
>>> +  }
>>> +
>>> +
>>> +  @Override
>>> +  public Iterator<Vector> iterator() {
>>> +    return new CSVIterator();
>>> +  }
>>> +
>>> +  private class CSVIterator implements Iterator<Vector>{
>>> +
>>> +
>>> +    public CSVIterator() {
>>> +    }
>>> +
>>> +    @Override
>>> +    public boolean hasNext() {
>>> +      return line != null;
>>> +    }
>>> +
>>> +    @Override
>>> +    public Vector next() {
>>> +
>>> +      Vector result = null;
>>> +      result = new DenseVector(line.length);
>>> +      for (int i = 0; i < line.length; i++) {
>>> +        result.setQuick(i, Double.parseDouble(line[i]));
>>> +      }
>>> +      //move the line forward
>>> +      try {
>>> +        line = parser.getLine();
>>> +      } catch (IOException e) {
>>> +        throw new RuntimeException(e);
>>> +      }
>>> +      return result;
>>> +    }
>>> +
>>> +    @Override
>>> +    public void remove() {
>>> +      throw new UnsupportedOperationException();
>>> +    }
>>> +  }
>>> +}
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Fri Mar 25 14:28:12 2011
>>> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
>>>  /**
>>>  * Write out the vectors to any {@link java.io.Writer} using {@link org.apache.mahout.math.Vector#asFormatString()}.
>>>  */
>>> -public class JWriterVectorWriter implements VectorWriter {
>>> -  private final Writer writer;
>>> +public class JWriterVectorWriter extends VectorWriter {
>>> +  protected final Writer writer;
>>>
>>>   public JWriterVectorWriter(Writer writer) {
>>>     this.writer = writer;
>>> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
>>>       if (result >= maxDocs) {
>>>         break;
>>>       }
>>> -      writer.write(vector.asFormatString());
>>> -      writer.write('\n');
>>> -
>>> +      formatVector(vector);
>>>       result++;
>>>     }
>>>     return result;
>>>   }
>>> -
>>> +
>>> +  protected void formatVector(Vector vector) throws IOException {
>>> +    writer.write(vector.asFormatString());
>>> +    writer.write('\n');
>>> +  }
>>> +
>>> +  @Override
>>> +  public void write(Vector vector) throws IOException {
>>> +    formatVector(vector);
>>> +  }
>>> +
>>>   @Override
>>>   public void close() throws IOException {
>>>     writer.flush();
>>>
>>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java (added)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,105 @@
>>> +package org.apache.mahout.utils.vectors.io;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements.  See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License.  You may obtain a copy of the License at
>>> + *
>>> + *     http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.ContentSummary;
>>> +import org.apache.hadoop.fs.FileSystem;
>>> +import org.apache.hadoop.fs.Path;
>>> +import org.apache.hadoop.io.SequenceFile;
>>> +import org.apache.hadoop.io.Writable;
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.math.VectorWritable;
>>> +
>>> +import java.io.IOException;
>>> +import java.util.Iterator;
>>> +
>>> +
>>> +/**
>>> + * Given a Sequence File containing vectors (actually, {@link org.apache.mahout.math.VectorWritable}, iterate over it.
>>> + *
>>> + **/
>>> +public class SequenceFileVectorIterable implements Iterable<Vector>{
>>> +  protected SequenceFile.Reader reader;
>>> +  protected long fileLen;
>>> +  protected Writable keyWritable;
>>> +  protected Writable valueWritable;
>>> +  protected boolean useKey;
>>> +
>>> +  /**
>>> +   * Construct the Iterable
>>> +   * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the {@link org.apache.hadoop.io.SequenceFile}
>>> +   * @param file The {@link org.apache.hadoop.fs.Path} containing the file
>>> +   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
>>> +   * @param useKey If true, use the key as the {@link org.apache.mahout.math.VectorWritable}, otherwise use the value
>>> +   * @throws IllegalAccessException
>>> +   * @throws InstantiationException
>>> +   * @throws IOException
>>> +   */
>>> +  public SequenceFileVectorIterable(FileSystem fs, Path file, Configuration conf, boolean useKey) throws IllegalAccessException, InstantiationException, IOException {
>>> +    this.reader = new SequenceFile.Reader(fs, file, conf);
>>> +    ContentSummary summary = fs.getContentSummary(file);
>>> +    fileLen = summary.getLength();
>>> +    this.useKey = useKey;
>>> +    keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>> +    valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>>> +  }
>>> +
>>> +  /**
>>> +   * The Iterator returned does not support remove()
>>> +   * @return The {@link java.util.Iterator}
>>> +   */
>>> +  public Iterator<Vector> iterator() {
>>> +    return new SFIterator();
>>> +
>>> +  }
>>> +
>>> +  private final class SFIterator implements Iterator<Vector>{
>>> +    @Override
>>> +    public boolean hasNext() {
>>> +      //TODO: is this legitimate?  We can't call next here since it breaks the iterator contract
>>> +      try {
>>> +        return reader.getPosition() < fileLen;
>>> +      } catch (IOException e) {
>>> +        return false;
>>> +      }
>>> +    }
>>> +
>>> +    @Override
>>> +    public Vector next() {
>>> +      Vector result = null;
>>> +      boolean valid = false;
>>> +      try {
>>> +        valid = reader.next(keyWritable, valueWritable);
>>> +        if (valid){
>>> +          result = ((VectorWritable) (useKey ? keyWritable : valueWritable)).get();
>>> +        }
>>> +      } catch (IOException e) {
>>> +        throw new RuntimeException(e);
>>> +      }
>>> +
>>> +      return result;
>>> +    }
>>> +
>>> +    /**
>>> +     * Not supported
>>> +     */
>>> +    public void remove() {
>>> +      throw new UnsupportedOperationException();
>>> +    }
>>> +  }
>>> +}
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Fri Mar 25 14:28:12 2011
>>> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
>>>  *
>>>  * Closes the writer when done
>>>  */
>>> -public class SequenceFileVectorWriter implements VectorWriter {
>>> +public class SequenceFileVectorWriter extends VectorWriter {
>>>   private final SequenceFile.Writer writer;
>>> -
>>> +  long recNum = 0;
>>>   public SequenceFileVectorWriter(SequenceFile.Writer writer) {
>>>     this.writer = writer;
>>>   }
>>>
>>>   @Override
>>>   public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
>>> -    long recNum = 0;
>>> +
>>>     for (Vector point : iterable) {
>>>       if (recNum >= maxDocs) {
>>>         break;
>>> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
>>>     }
>>>     return recNum;
>>>   }
>>> -
>>> +
>>> +  @Override
>>> +  public void write(Vector vector) throws IOException {
>>> +    writer.append(new LongWritable(recNum++), new VectorWritable(vector));
>>> +
>>> +  }
>>> +
>>>   @Override
>>>   public long write(Iterable<Vector> iterable) throws IOException {
>>>     return write(iterable, Long.MAX_VALUE);
>>>
>>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (original)
>>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Fri Mar 25 14:28:12 2011
>>> @@ -21,7 +21,7 @@ import java.io.IOException;
>>>
>>>  import org.apache.mahout.math.Vector;
>>>
>>> -public interface VectorWriter {
>>> +public abstract class VectorWriter {
>>>   /**
>>>    * Write all values in the Iterable to the output
>>>    * @param iterable The {@link Iterable} to loop over
>>> @@ -29,7 +29,15 @@ public interface VectorWriter {
>>>    * @throws IOException if there was a problem writing
>>>    *
>>>    */
>>> -  long write(Iterable<Vector> iterable) throws IOException;
>>> +  public abstract long write(Iterable<Vector> iterable) throws IOException;
>>> +
>>> +  /**
>>> +   * Write out a vector
>>> +   *
>>> +   * @param vector The {@link org.apache.mahout.math.Vector} to write
>>> +   * @throws IOException
>>> +   */
>>> +  public abstract void write(Vector vector) throws IOException;
>>>
>>>   /**
>>>    * Write the first <code>maxDocs</code> to the output.
>>> @@ -38,12 +46,12 @@ public interface VectorWriter {
>>>    * @return The number of docs written
>>>    * @throws IOException if there was a problem writing
>>>    */
>>> -  long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>>> +  public abstract long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>>>
>>>   /**
>>>    * Close any internally held resources.  If external Writers are passed in, the implementation should indicate
>>>    * whether it also closes them
>>>    * @throws IOException if there was an issue closing the item
>>>    */
>>> -  void close() throws IOException;
>>> +  public abstract void close() throws IOException;
>>>  }
>>>
>>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (added)
>>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,60 @@
>>> +package org.apache.mahout.utils.vectors.csv;
>>> +/**
>>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>>> + * contributor license agreements.  See the NOTICE file distributed with
>>> + * this work for additional information regarding copyright ownership.
>>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>>> + * (the "License"); you may not use this file except in compliance with
>>> + * the License.  You may obtain a copy of the License at
>>> + *
>>> + *     http://www.apache.org/licenses/LICENSE-2.0
>>> + *
>>> + * Unless required by applicable law or agreed to in writing, software
>>> + * distributed under the License is distributed on an "AS IS" BASIS,
>>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>>> + * See the License for the specific language governing permissions and
>>> + * limitations under the License.
>>> + */
>>> +
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.utils.MahoutTestCase;
>>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>>> +import org.apache.mahout.utils.vectors.VectorHelper;
>>> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
>>> +import org.junit.Test;
>>> +
>>> +import java.io.IOException;
>>> +import java.io.StringReader;
>>> +import java.io.StringWriter;
>>> +
>>> +
>>> +/**
>>> + *
>>> + *
>>> + **/
>>> +public class CSVVectorIterableTest extends MahoutTestCase {
>>> +
>>> +
>>> +  @Test
>>> +  public void test() throws Exception {
>>> +
>>> +    StringWriter sWriter = new StringWriter();
>>> +    JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
>>> +
>>> +      protected void formatVector(Vector vector) throws IOException {
>>> +        String vecStr = VectorHelper.vectorToCSVString(vector, false);
>>> +        writer.write(vecStr);
>>> +      }
>>> +    };
>>> +    Iterable<Vector> iter = new RandomVectorIterable(50);
>>> +    jwvw.write(iter);
>>> +    jwvw.close();
>>> +    CSVVectorIterable csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString()));
>>> +    int count = 0;
>>> +    for (Vector vector : csvIter) {
>>> +      //System.out.println("Vec: " + vector);
>>> +      count++;
>>> +    }
>>> +    assertEquals(50, count);
>>> +  }
>>> +}
>>>
>>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
>>> ==============================================================================
>>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java (added)
>>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java Fri Mar 25 14:28:12 2011
>>> @@ -0,0 +1,39 @@
>>> +package org.apache.mahout.utils.vectors.io;
>>> +
>>> +import org.apache.hadoop.conf.Configuration;
>>> +import org.apache.hadoop.fs.FileSystem;
>>> +import org.apache.hadoop.fs.Path;
>>> +import org.apache.hadoop.io.LongWritable;
>>> +import org.apache.hadoop.io.SequenceFile;
>>> +import org.apache.mahout.math.Vector;
>>> +import org.apache.mahout.math.VectorWritable;
>>> +import org.apache.mahout.utils.MahoutTestCase;
>>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>>> +import org.junit.Test;
>>> +
>>> +
>>> +/**
>>> + *
>>> + *
>>> + **/
>>> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
>>> +
>>> +
>>> +  @Test
>>> +  public void testSFVI() throws Exception {
>>> +    Path path = getTestTempFilePath("sfvw");
>>> +    Configuration conf = new Configuration();
>>> +    FileSystem fs = FileSystem.get(conf);
>>> +    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
>>> +    SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
>>> +    Iterable<Vector> iter = new RandomVectorIterable(50);
>>> +    writer.write(iter);
>>> +    writer.close();
>>> +    SequenceFileVectorIterable sfVIter = new SequenceFileVectorIterable(fs, path, conf, false);
>>> +    int count = 0;
>>> +    for (Vector vector : sfVIter) {
>>> +      count++;
>>> +    }
>>> +    assertEquals(50, count);
>>> +  }
>>> +}
>>>
>>>
>>>
>
> --------------------------
> Grant Ingersoll
> http://www.lucidimagination.com/
>
> Search the Lucene ecosystem docs using Solr/Lucene:
> http://www.lucidimagination.com/search
>
>

Re: svn commit: r1085397 - in /mahout/trunk/utils: ./ src/main/java/org/apache/mahout/utils/vectors/ src/main/java/org/apache/mahout/utils/vectors/csv/ src/main/java/org/apache/mahout/utils/vectors/io/ src/test/java/org/apache/mahout/utils/vectors/cs

Posted by Grant Ingersoll <gs...@apache.org>.
Ah, OK.  Good to know.  Hadn't followed that one.   Feel free to change as appropriate or I can.

On Mar 25, 2011, at 9:12 PM, Dmitriy Lyubimov wrote:

> That would be a typical change I am trying to fix with 622:
> http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?view=diff&r1=1085396&r2=1085397&pathrev=1085397
> 
> 
> 
> On Fri, Mar 25, 2011 at 7:28 AM,  <gs...@apache.org> wrote:
>> Author: gsingers
>> Date: Fri Mar 25 14:28:12 2011
>> New Revision: 1085397
>> 
>> URL: http://svn.apache.org/viewvc?rev=1085397&view=rev
>> Log:
>> MAHOUT-548: add in some CSV support for creating vectors, as well as a few other fixes for working with vectors
>> 
>> Added:
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/
>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>>    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>> Modified:
>>    mahout/trunk/utils/pom.xml
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>>    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>> 
>> Modified: mahout/trunk/utils/pom.xml
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/pom.xml?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/pom.xml (original)
>> +++ mahout/trunk/utils/pom.xml Fri Mar 25 14:28:12 2011
>> @@ -142,6 +142,11 @@
>>       <type>test-jar</type>
>>       <scope>test</scope>
>>     </dependency>
>> +    <dependency>
>> +      <groupId>org.apache.solr</groupId>
>> +      <artifactId>solr-commons-csv</artifactId>
>> +      <version>1.4.1</version>
>> +    </dependency>
>> 
>>     <dependency>
>>       <groupId>junit</groupId>
>> 
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Fri Mar 25 14:28:12 2011
>> @@ -77,16 +77,22 @@ public final class VectorDumper {
>>     Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
>>             abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
>>             "The dictionary file type (text|sequencefile)").withShortName("dt").create();
>> -    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>> -            "Output the centroid as JSON.  Otherwise it substitutes in the terms for vector cell entries")
>> +    Option jsonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
>> +            "Output the Vector as JSON.  Otherwise it substitutes in the terms for vector cell entries")
>>             .withShortName("j").create();
>> +    Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
>> +            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries")
>> +            .withShortName("c").create();
>> +    Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
>> +            "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
>> +            .withShortName("n").create();
>>     Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
>>             withDescription("Dump only the size of the vector").withShortName("sz").create();
>>     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
>>             .create();
>> 
>>     Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
>> -            dictTypeOpt).withOption(dictOpt).withOption(centroidJSonOpt).withOption(vectorAsKeyOpt).withOption(
>> +            dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
>>             printKeyOpt).withOption(sizeOpt).create();
>> 
>>     try {
>> @@ -122,10 +128,12 @@ public final class VectorDumper {
>>             throw new OptionException(dictTypeOpt);
>>           }
>>         }
>> -        boolean useJSON = cmdLine.hasOption(centroidJSonOpt);
>> +        boolean useJSON = cmdLine.hasOption(jsonOpt);
>> +        boolean useCSV = cmdLine.hasOption(csvOpt);
>> +
>>         boolean sizeOnly = cmdLine.hasOption(sizeOpt);
>>         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
>> -
>> +        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
>>         Writable keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>>         Writable valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>>         boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
>> @@ -140,6 +148,16 @@ public final class VectorDumper {
>>           try {
>>             boolean printKey = cmdLine.hasOption(printKeyOpt);
>>             long i = 0;
>> +            if (useCSV && dictionary != null){
>> +              writer.write("#");
>> +              for (int j = 0; j < dictionary.length; j++) {
>> +                writer.write(dictionary[j]);
>> +                if (j < dictionary.length - 1){
>> +                  writer.write(',');
>> +                }
>> +              }
>> +              writer.write('\n');
>> +            }
>>             while (reader.next(keyWritable, valueWritable)) {
>>               if (printKey) {
>>                 Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
>> @@ -159,7 +177,14 @@ public final class VectorDumper {
>>                 writer.write(String.valueOf(vector.size()));
>>                 writer.write('\n');
>>               } else {
>> -                String fmtStr = useJSON ? vector.asFormatString() : VectorHelper.vectorToString(vector, dictionary);
>> +                String fmtStr;
>> +                if (useJSON){
>> +                  fmtStr = VectorHelper.vectorToJSONString(vector, dictionary);
>> +                } else if (useCSV){
>> +                  fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
>> +                } else {
>> +                  fmtStr = vector.asFormatString();
>> +                }
>>                 writer.write(fmtStr);
>>                 writer.write('\n');
>>               }
>> 
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Fri Mar 25 14:28:12 2011
>> @@ -40,14 +40,45 @@ import org.apache.mahout.math.map.OpenOb
>>  public final class VectorHelper {
>> 
>>   private static final Pattern TAB_PATTERN = Pattern.compile("\t");
>> +
>> 
>>   private VectorHelper() { }
>> -
>> +
>> +  public static String vectorToCSVString(Vector vector, boolean namesAsComments){
>> +    StringBuilder bldr = new StringBuilder(2048);
>> +    try {
>> +      vectorToCSVString(vector, namesAsComments, bldr);
>> +    } catch (IOException e) {
>> +      throw new RuntimeException(e);
>> +    }
>> +    return bldr.toString();
>> +  }
>> +
>> +  public static void vectorToCSVString(Vector vector, boolean namesAsComments,
>> +                                       Appendable bldr) throws IOException {
>> +    if (namesAsComments && vector instanceof NamedVector){
>> +      bldr.append("#").append(((NamedVector)vector).getName()).append('\n');
>> +    }
>> +    Iterator<Vector.Element> iter = vector.iterator();
>> +    boolean first = true;
>> +    while (iter.hasNext()) {
>> +      if (first) {
>> +        first = false;
>> +      } else {
>> +        bldr.append(",");
>> +      }
>> +      Vector.Element elt = iter.next();
>> +      bldr.append(String.valueOf(elt.get()));
>> +    }
>> +    bldr.append('\n');
>> +  }
>> +
>> +
>>   /**
>>    * @return a String from a vector that fills in the values with the appropriate value from a dictionary where
>>    * each the ith entry is the term for the ith vector cell.
>>    */
>> -  public static String vectorToString(Vector vector, String[] dictionary) {
>> +  public static String vectorToJSONString(Vector vector, String[] dictionary) {
>>     StringBuilder bldr = new StringBuilder(2048);
>> 
>>     if (vector instanceof NamedVector) {
>> @@ -67,12 +98,13 @@ public final class VectorHelper {
>>       if (dictionary != null) {
>>         bldr.append(dictionary[elt.index()]);
>>       } else {
>> -        bldr.append(elt.index());
>> +        bldr.append(String.valueOf(elt.index()));
>>       }
>> -      bldr.append(':').append(elt.get());
>> +      bldr.append(':').append(String.valueOf(elt.get()));
>>     }
>>     return bldr.append('}').toString();
>>   }
>> +
>> 
>>   /**
>>    * Read in a dictionary file. Format is:
>> 
>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java (added)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterable.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,94 @@
>> +package org.apache.mahout.utils.vectors.csv;
>> +/**
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements.  See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License.  You may obtain a copy of the License at
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +import org.apache.commons.csv.CSVParser;
>> +import org.apache.commons.csv.CSVStrategy;
>> +import org.apache.mahout.math.DenseVector;
>> +import org.apache.mahout.math.Vector;
>> +
>> +import java.io.BufferedReader;
>> +import java.io.IOException;
>> +import java.io.Reader;
>> +import java.util.Iterator;
>> +
>> +
>> +/**
>> + * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
>> + * <br/>
>> + * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
>> + * <p/>
>> + * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
>> + * <p/>
>> + * The Iterator is not thread-safe.
>> + *
>> + *
>> + **/
>> +public class CSVVectorIterable implements Iterable<Vector> {
>> +  protected CSVParser parser;
>> +  protected String [] line;
>> +
>> +  public CSVVectorIterable(Reader reader) throws IOException {
>> +    parser = new CSVParser(reader);
>> +    line = parser.getLine();
>> +  }
>> +
>> +  public CSVVectorIterable(Reader reader, CSVStrategy strategy) throws IOException {
>> +    parser = new CSVParser(reader, strategy);
>> +    line = parser.getLine();
>> +  }
>> +
>> +
>> +  @Override
>> +  public Iterator<Vector> iterator() {
>> +    return new CSVIterator();
>> +  }
>> +
>> +  private class CSVIterator implements Iterator<Vector>{
>> +
>> +
>> +    public CSVIterator() {
>> +    }
>> +
>> +    @Override
>> +    public boolean hasNext() {
>> +      return line != null;
>> +    }
>> +
>> +    @Override
>> +    public Vector next() {
>> +
>> +      Vector result = null;
>> +      result = new DenseVector(line.length);
>> +      for (int i = 0; i < line.length; i++) {
>> +        result.setQuick(i, Double.parseDouble(line[i]));
>> +      }
>> +      //move the line forward
>> +      try {
>> +        line = parser.getLine();
>> +      } catch (IOException e) {
>> +        throw new RuntimeException(e);
>> +      }
>> +      return result;
>> +    }
>> +
>> +    @Override
>> +    public void remove() {
>> +      throw new UnsupportedOperationException();
>> +    }
>> +  }
>> +}
>> 
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Fri Mar 25 14:28:12 2011
>> @@ -25,8 +25,8 @@ import org.apache.mahout.math.Vector;
>>  /**
>>  * Write out the vectors to any {@link java.io.Writer} using {@link org.apache.mahout.math.Vector#asFormatString()}.
>>  */
>> -public class JWriterVectorWriter implements VectorWriter {
>> -  private final Writer writer;
>> +public class JWriterVectorWriter extends VectorWriter {
>> +  protected final Writer writer;
>> 
>>   public JWriterVectorWriter(Writer writer) {
>>     this.writer = writer;
>> @@ -45,14 +45,22 @@ public class JWriterVectorWriter impleme
>>       if (result >= maxDocs) {
>>         break;
>>       }
>> -      writer.write(vector.asFormatString());
>> -      writer.write('\n');
>> -
>> +      formatVector(vector);
>>       result++;
>>     }
>>     return result;
>>   }
>> -
>> +
>> +  protected void formatVector(Vector vector) throws IOException {
>> +    writer.write(vector.asFormatString());
>> +    writer.write('\n');
>> +  }
>> +
>> +  @Override
>> +  public void write(Vector vector) throws IOException {
>> +    formatVector(vector);
>> +  }
>> +
>>   @Override
>>   public void close() throws IOException {
>>     writer.flush();
>> 
>> Added: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java (added)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterable.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,105 @@
>> +package org.apache.mahout.utils.vectors.io;
>> +/**
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements.  See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License.  You may obtain a copy of the License at
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.fs.ContentSummary;
>> +import org.apache.hadoop.fs.FileSystem;
>> +import org.apache.hadoop.fs.Path;
>> +import org.apache.hadoop.io.SequenceFile;
>> +import org.apache.hadoop.io.Writable;
>> +import org.apache.mahout.math.Vector;
>> +import org.apache.mahout.math.VectorWritable;
>> +
>> +import java.io.IOException;
>> +import java.util.Iterator;
>> +
>> +
>> +/**
>> + * Given a Sequence File containing vectors (actually, {@link org.apache.mahout.math.VectorWritable}, iterate over it.
>> + *
>> + **/
>> +public class SequenceFileVectorIterable implements Iterable<Vector>{
>> +  protected SequenceFile.Reader reader;
>> +  protected long fileLen;
>> +  protected Writable keyWritable;
>> +  protected Writable valueWritable;
>> +  protected boolean useKey;
>> +
>> +  /**
>> +   * Construct the Iterable
>> +   * @param fs The {@link org.apache.hadoop.fs.FileSystem} containing the {@link org.apache.hadoop.io.SequenceFile}
>> +   * @param file The {@link org.apache.hadoop.fs.Path} containing the file
>> +   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use
>> +   * @param useKey If true, use the key as the {@link org.apache.mahout.math.VectorWritable}, otherwise use the value
>> +   * @throws IllegalAccessException
>> +   * @throws InstantiationException
>> +   * @throws IOException
>> +   */
>> +  public SequenceFileVectorIterable(FileSystem fs, Path file, Configuration conf, boolean useKey) throws IllegalAccessException, InstantiationException, IOException {
>> +    this.reader = new SequenceFile.Reader(fs, file, conf);
>> +    ContentSummary summary = fs.getContentSummary(file);
>> +    fileLen = summary.getLength();
>> +    this.useKey = useKey;
>> +    keyWritable = reader.getKeyClass().asSubclass(Writable.class).newInstance();
>> +    valueWritable = reader.getValueClass().asSubclass(Writable.class).newInstance();
>> +  }
>> +
>> +  /**
>> +   * The Iterator returned does not support remove()
>> +   * @return The {@link java.util.Iterator}
>> +   */
>> +  public Iterator<Vector> iterator() {
>> +    return new SFIterator();
>> +
>> +  }
>> +
>> +  private final class SFIterator implements Iterator<Vector>{
>> +    @Override
>> +    public boolean hasNext() {
>> +      //TODO: is this legitimate?  We can't call next here since it breaks the iterator contract
>> +      try {
>> +        return reader.getPosition() < fileLen;
>> +      } catch (IOException e) {
>> +        return false;
>> +      }
>> +    }
>> +
>> +    @Override
>> +    public Vector next() {
>> +      Vector result = null;
>> +      boolean valid = false;
>> +      try {
>> +        valid = reader.next(keyWritable, valueWritable);
>> +        if (valid){
>> +          result = ((VectorWritable) (useKey ? keyWritable : valueWritable)).get();
>> +        }
>> +      } catch (IOException e) {
>> +        throw new RuntimeException(e);
>> +      }
>> +
>> +      return result;
>> +    }
>> +
>> +    /**
>> +     * Not supported
>> +     */
>> +    public void remove() {
>> +      throw new UnsupportedOperationException();
>> +    }
>> +  }
>> +}
>> 
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Fri Mar 25 14:28:12 2011
>> @@ -30,16 +30,16 @@ import org.apache.mahout.math.VectorWrit
>>  *
>>  * Closes the writer when done
>>  */
>> -public class SequenceFileVectorWriter implements VectorWriter {
>> +public class SequenceFileVectorWriter extends VectorWriter {
>>   private final SequenceFile.Writer writer;
>> -
>> +  long recNum = 0;
>>   public SequenceFileVectorWriter(SequenceFile.Writer writer) {
>>     this.writer = writer;
>>   }
>> 
>>   @Override
>>   public long write(Iterable<Vector> iterable, long maxDocs) throws IOException {
>> -    long recNum = 0;
>> +
>>     for (Vector point : iterable) {
>>       if (recNum >= maxDocs) {
>>         break;
>> @@ -51,7 +51,13 @@ public class SequenceFileVectorWriter im
>>     }
>>     return recNum;
>>   }
>> -
>> +
>> +  @Override
>> +  public void write(Vector vector) throws IOException {
>> +    writer.append(new LongWritable(recNum++), new VectorWritable(vector));
>> +
>> +  }
>> +
>>   @Override
>>   public long write(Iterable<Vector> iterable) throws IOException {
>>     return write(iterable, Long.MAX_VALUE);
>> 
>> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=1085397&r1=1085396&r2=1085397&view=diff
>> ==============================================================================
>> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (original)
>> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Fri Mar 25 14:28:12 2011
>> @@ -21,7 +21,7 @@ import java.io.IOException;
>> 
>>  import org.apache.mahout.math.Vector;
>> 
>> -public interface VectorWriter {
>> +public abstract class VectorWriter {
>>   /**
>>    * Write all values in the Iterable to the output
>>    * @param iterable The {@link Iterable} to loop over
>> @@ -29,7 +29,15 @@ public interface VectorWriter {
>>    * @throws IOException if there was a problem writing
>>    *
>>    */
>> -  long write(Iterable<Vector> iterable) throws IOException;
>> +  public abstract long write(Iterable<Vector> iterable) throws IOException;
>> +
>> +  /**
>> +   * Write out a vector
>> +   *
>> +   * @param vector The {@link org.apache.mahout.math.Vector} to write
>> +   * @throws IOException
>> +   */
>> +  public abstract void write(Vector vector) throws IOException;
>> 
>>   /**
>>    * Write the first <code>maxDocs</code> to the output.
>> @@ -38,12 +46,12 @@ public interface VectorWriter {
>>    * @return The number of docs written
>>    * @throws IOException if there was a problem writing
>>    */
>> -  long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>> +  public abstract long write(Iterable<Vector> iterable, long maxDocs) throws IOException;
>> 
>>   /**
>>    * Close any internally held resources.  If external Writers are passed in, the implementation should indicate
>>    * whether it also closes them
>>    * @throws IOException if there was an issue closing the item
>>    */
>> -  void close() throws IOException;
>> +  public abstract void close() throws IOException;
>>  }
>> 
>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java (added)
>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterableTest.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,60 @@
>> +package org.apache.mahout.utils.vectors.csv;
>> +/**
>> + * Licensed to the Apache Software Foundation (ASF) under one or more
>> + * contributor license agreements.  See the NOTICE file distributed with
>> + * this work for additional information regarding copyright ownership.
>> + * The ASF licenses this file to You under the Apache License, Version 2.0
>> + * (the "License"); you may not use this file except in compliance with
>> + * the License.  You may obtain a copy of the License at
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +import org.apache.mahout.math.Vector;
>> +import org.apache.mahout.utils.MahoutTestCase;
>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>> +import org.apache.mahout.utils.vectors.VectorHelper;
>> +import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
>> +import org.junit.Test;
>> +
>> +import java.io.IOException;
>> +import java.io.StringReader;
>> +import java.io.StringWriter;
>> +
>> +
>> +/**
>> + *
>> + *
>> + **/
>> +public class CSVVectorIterableTest extends MahoutTestCase {
>> +
>> +
>> +  @Test
>> +  public void test() throws Exception {
>> +
>> +    StringWriter sWriter = new StringWriter();
>> +    JWriterVectorWriter jwvw = new JWriterVectorWriter(sWriter) {
>> +
>> +      protected void formatVector(Vector vector) throws IOException {
>> +        String vecStr = VectorHelper.vectorToCSVString(vector, false);
>> +        writer.write(vecStr);
>> +      }
>> +    };
>> +    Iterable<Vector> iter = new RandomVectorIterable(50);
>> +    jwvw.write(iter);
>> +    jwvw.close();
>> +    CSVVectorIterable csvIter = new CSVVectorIterable(new StringReader(sWriter.getBuffer().toString()));
>> +    int count = 0;
>> +    for (Vector vector : csvIter) {
>> +      //System.out.println("Vec: " + vector);
>> +      count++;
>> +    }
>> +    assertEquals(50, count);
>> +  }
>> +}
>> 
>> Added: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java
>> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java?rev=1085397&view=auto
>> ==============================================================================
>> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java (added)
>> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorIterableTest.java Fri Mar 25 14:28:12 2011
>> @@ -0,0 +1,39 @@
>> +package org.apache.mahout.utils.vectors.io;
>> +
>> +import org.apache.hadoop.conf.Configuration;
>> +import org.apache.hadoop.fs.FileSystem;
>> +import org.apache.hadoop.fs.Path;
>> +import org.apache.hadoop.io.LongWritable;
>> +import org.apache.hadoop.io.SequenceFile;
>> +import org.apache.mahout.math.Vector;
>> +import org.apache.mahout.math.VectorWritable;
>> +import org.apache.mahout.utils.MahoutTestCase;
>> +import org.apache.mahout.utils.vectors.RandomVectorIterable;
>> +import org.junit.Test;
>> +
>> +
>> +/**
>> + *
>> + *
>> + **/
>> +public class SequenceFileVectorIterableTest extends MahoutTestCase {
>> +
>> +
>> +  @Test
>> +  public void testSFVI() throws Exception {
>> +    Path path = getTestTempFilePath("sfvw");
>> +    Configuration conf = new Configuration();
>> +    FileSystem fs = FileSystem.get(conf);
>> +    SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, VectorWritable.class);
>> +    SequenceFileVectorWriter writer = new SequenceFileVectorWriter(seqWriter);
>> +    Iterable<Vector> iter = new RandomVectorIterable(50);
>> +    writer.write(iter);
>> +    writer.close();
>> +    SequenceFileVectorIterable sfVIter = new SequenceFileVectorIterable(fs, path, conf, false);
>> +    int count = 0;
>> +    for (Vector vector : sfVIter) {
>> +      count++;
>> +    }
>> +    assertEquals(50, count);
>> +  }
>> +}
>> 
>> 
>> 

--------------------------
Grant Ingersoll
http://www.lucidimagination.com/

Search the Lucene ecosystem docs using Solr/Lucene:
http://www.lucidimagination.com/search