You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/06/09 12:59:47 UTC

svn commit: r1491181 - in /mahout/trunk: ./ core/src/main/java/org/apache/mahout/classifier/df/ core/src/main/java/org/apache/mahout/classifier/df/data/ core/src/main/java/org/apache/mahout/classifier/df/tools/ core/src/test/java/org/apache/mahout/clas...

Author: ssc
Date: Sun Jun  9 10:59:47 2013
New Revision: 1491181

URL: http://svn.apache.org/r1491181
Log:
MAHOUT-1163: Make random forest classifier meta-data file human readable

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
    mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Jun  9 10:59:47 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.8 - unreleased
 
+  MAHOUT-1163: Make random forest classifier meta-data file human readable (Marty Kube via ssc)
+
   MAHOUT-1243: Dictionary file format in Lucene-Mahout integration is not in SequenceFileFormat (ssc)
 
   MAHOUT-974:  org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob use integer as userId and itemId (ssc)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java Sun Jun  9 10:59:47 2013
@@ -30,14 +30,17 @@ import org.apache.mahout.common.iterator
 
 import java.io.DataInput;
 import java.io.DataOutput;
+import java.io.DataOutputStream;
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.List;
 
 /**
  * Utility class that contains various helper methods
  */
 public final class DFUtils {
-  private DFUtils() { }
+
+  private DFUtils() {}
   
   /**
    * Writes an Node[] into a DataOutput
@@ -157,4 +160,22 @@ public final class DFUtils {
       Closeables.closeQuietly(out);
     }
   }
+  
+  /**
+   * Write a string to a path.
+   * @param conf From which the file system will be picked
+   * @param path Where the string will be written
+   * @param string The string to write
+   * @throws IOException if the file cannot be created or written
+   */
+  public static void storeString(Configuration conf, Path path, String string) throws IOException {
+    DataOutputStream out = null;
+    try {
+      out = path.getFileSystem(conf).create(path);
+      out.write(string.getBytes(Charset.defaultCharset()));
+    } finally {
+      Closeables.closeQuietly(out);
+    }
+  }
+  
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java Sun Jun  9 10:59:47 2013
@@ -18,26 +18,28 @@
 package org.apache.mahout.classifier.df.data;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.google.common.io.Closeables;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.mahout.classifier.df.DFUtils;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.type.TypeReference;
 
-import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Locale;
+import java.util.Map;
 
 /**
  * Contains informations about the attributes.
  */
-public class Dataset implements Writable {
+public class Dataset {
 
   /**
    * Attributes type
@@ -63,6 +65,19 @@ public class Dataset implements Writable
     public boolean isIgnored() {
       return this == IGNORED;
     }
+    
+    private static Attribute fromString(String from) {
+      
+      Attribute toReturn = LABEL;
+      if(NUMERICAL.toString().equalsIgnoreCase(from)) {
+        toReturn = NUMERICAL;
+      } else if (CATEGORICAL.toString().equalsIgnoreCase(from)) {
+        toReturn = CATEGORICAL;
+      } else if (IGNORED.toString().equalsIgnoreCase(from)) {
+        toReturn = IGNORED;
+      }
+      return toReturn;
+    }
   }
 
   private Attribute[] attributes;
@@ -86,8 +101,16 @@ public class Dataset implements Writable
    * number of instances in the dataset
    */
   private int nbInstances;
+  
+  /** JSON serializer/deserializer */
+  private static final ObjectMapper objectMapper = new ObjectMapper();
+
+  // Some literals for JSON representation
+  final static String TYPE = "type";
+  final static String VALUES = "values";
+  final static String LABEL = "label";
 
-  private Dataset() {
+  protected Dataset() {
   }
 
   /**
@@ -161,9 +184,9 @@ public class Dataset implements Writable
   public double getLabel(Instance instance) {
     return instance.get(getLabelId());
   }
-
-  public int nbInstances() {
-    return nbInstances;
+  
+  public Attribute getAttribute(int attr) {
+	  return attributes[attr];
   }
 
   /**
@@ -190,11 +213,15 @@ public class Dataset implements Writable
     }
     return values[labelId][(int) code];
   }
+  
+  public String toString() {
+	  return "attributes="+Arrays.toString(attributes);
+  }
 
   /**
-   * Converts a token to its corresponding int code for a given attribute
+   * Converts a token to its corresponding integer code for a given attribute
    *
-   * @param attr attribute's index
+   * @param attr attribute index
    */
   public int valueOf(int attr, String token) {
     Preconditions.checkArgument(!isNumerical(attr), "Only for CATEGORICAL attributes");
@@ -206,7 +233,6 @@ public class Dataset implements Writable
     return ignored;
   }
 
-
   /**
    * @return number of attributes that are not IGNORED
    */
@@ -294,63 +320,110 @@ public class Dataset implements Writable
    * @throws java.io.IOException
    */
   public static Dataset load(Configuration conf, Path path) throws IOException {
+
     FileSystem fs = path.getFileSystem(conf);
+    long bytesToRead = fs.getFileStatus(path).getLen();
+    byte[] buff = new byte[new Long(bytesToRead).intValue()];
     FSDataInputStream input = fs.open(path);
     try {
-      return read(input);
+      input.readFully(buff);
     } finally {
       Closeables.closeQuietly(input);
     }
+    String json = new String(buff, Charset.defaultCharset());
+    return fromJSON(json);
   }
+  
 
-  public static Dataset read(DataInput in) throws IOException {
-    Dataset dataset = new Dataset();
-
-    dataset.readFields(in);
-    return dataset;
-  }
-
-  @Override
-  public void readFields(DataInput in) throws IOException {
-    int nbAttributes = in.readInt();
-    attributes = new Attribute[nbAttributes];
-    for (int attr = 0; attr < nbAttributes; attr++) {
-      String name = WritableUtils.readString(in);
-      attributes[attr] = Attribute.valueOf(name);
-    }
-
-    ignored = DFUtils.readIntArray(in);
-
-    // only CATEGORICAL attributes have values
-    values = new String[nbAttributes][];
-    for (int attr = 0; attr < nbAttributes; attr++) {
-      if (attributes[attr].isCategorical()) {
-        values[attr] = WritableUtils.readStringArray(in);
+  /**
+   * Serialize this instance to JSON
+   * @return a JSON array with one entry per column (type, values, label flag)
+   */
+  public String toJSON() {
+
+    List<Map<String, Object>> toWrite = Lists.newLinkedList();
+    // attributes does not include ignored columns and it does include the class label
+    int ignoredCount = 0;
+    for (int i = 0; i < attributes.length + ignored.length; i++) {
+      Map<String, Object> attribute = null;
+      int attributesIndex = i - ignoredCount;
+      if (ignoredCount < ignored.length && i == ignored[ignoredCount]) {
+        // fill in ignored attribute
+        attribute = getMap(Attribute.IGNORED, null, false);
+        ignoredCount++;
+      } else if (attributesIndex == labelId) {
+        // fill in the label
+        attribute = getMap(attributes[attributesIndex], values[attributesIndex], true);
+      } else  {
+        // normal attribute
+        attribute = getMap(attributes[attributesIndex], values[attributesIndex], false);
       }
+      toWrite.add(attribute);
     }
-
-    labelId = in.readInt();
-    nbInstances = in.readInt();
-  }
-
-  @Override
-  public void write(DataOutput out) throws IOException {
-    out.writeInt(attributes.length); // nb attributes
-    for (Attribute attr : attributes) {
-      WritableUtils.writeString(out, attr.name());
+    try {
+      return objectMapper.writeValueAsString(toWrite);
+    } catch (Exception ex) {
+      throw new RuntimeException(ex);
     }
+  }
 
-    DFUtils.writeArray(out, ignored);
+  /**
+   * De-serialize an instance from a string
+   * @param json From which an instance is created
+   * @return A shiny new Dataset
+   */
+  public static Dataset fromJSON(String json) {
 
-    // only CATEGORICAL attributes have values
-    for (String[] vals : values) {
-      if (vals != null) {
-        WritableUtils.writeStringArray(out, vals);
+    Dataset dataset = new Dataset();
+    List<Map<String, Object>> fromJSON;
+    try {
+       fromJSON = objectMapper.readValue(json, new TypeReference<List<Map<String, Object>>>() {});
+    } catch (Exception ex) {
+      throw new RuntimeException(ex);
+    }
+    List<Attribute> attributes = Lists.newLinkedList();
+    List<Integer> ignored = Lists.newLinkedList();
+    String[][] nominalValues = new String[fromJSON.size()][];
+    for (int i = 0; i < fromJSON.size(); i++) {
+      Map<String, Object> attribute = fromJSON.get(i);
+      if(Attribute.fromString((String) attribute.get(TYPE)) == Attribute.IGNORED) {
+        ignored.add(i);
+      } else {
+        Attribute asAttribute = Attribute.fromString((String) attribute.get(TYPE));
+        attributes.add(asAttribute);
+        if((Boolean) attribute.get(LABEL)) {
+          dataset.labelId = i - ignored.size();
+        }
+        if(attribute.get(VALUES) != null) {
+          List get = (List) attribute.get(VALUES);
+          String[] array = (String[]) get.toArray(new String[]{});
+          nominalValues[i] = array;
+        }
       }
     }
-
-    out.writeInt(labelId);
-    out.writeInt(nbInstances);
+    dataset.attributes = attributes.toArray(new Attribute[]{});
+    dataset.ignored = new int[ignored.size()];
+    dataset.values = nominalValues;
+    for(int i = 0; i < dataset.ignored.length; i++) {
+      dataset.ignored[i] = ignored.get(i);
+    }
+    return dataset;
+  }
+  
+  /**
+   * Generate a map to describe an attribute
+   * @param type The type
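+   * @param values The values stored under the "values" key (may be null)
+   * @param isLabel Whether the attribute is the label
+   * @return A map holding the type, values and label flag of the attribute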
+   */
+  private Map<String, Object> getMap(Attribute type, String[] values, boolean isLabel) {
+
+    Map<String, Object> attribute = Maps.newHashMap();
+    attribute.put(TYPE, type.toString().toLowerCase(Locale.getDefault()));
+    attribute.put(VALUES, values);
+    attribute.put(LABEL, isLabel);
+    return attribute;
   }
 
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java Sun Jun  9 10:59:47 2013
@@ -49,8 +49,7 @@ public final class Describe {
 
   private static final Logger log = LoggerFactory.getLogger(Describe.class);
 
-  private Describe() {
-  }
+  private Describe() {}
 
   public static void main(String[] args) throws IOException, DescriptorException {
 
@@ -116,7 +115,8 @@ public final class Describe {
     Dataset dataset = generateDataset(descriptor, dataPath, regression);
 
     log.info("storing the dataset description");
-    DFUtils.storeWritable(new Configuration(), fPath, dataset);
+    String json = dataset.toJSON();
+    DFUtils.storeString(new Configuration(), fPath, json);
   }
 
   private static Dataset generateDataset(String descriptor, String dataPath, boolean regression) throws IOException,

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java Sun Jun  9 10:59:47 2013
@@ -52,12 +52,9 @@ public final class TreeVisualizer {
       if (node instanceof CategoricalNode) {
         CategoricalNode cnode = (CategoricalNode) node;
         int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
-        double[] values = (double[]) fields.get("CategoricalNode.values").get(
-            cnode);
-        Node[] childs = (Node[]) fields.get("CategoricalNode.childs")
-            .get(cnode);
-        String[][] attrValues = (String[][]) fields.get("Dataset.values").get(
-            dataset);
+        double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
+        Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
+        String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
         for (int i = 0; i < attrValues[attr].length; i++) {
           int index = ArrayUtils.indexOf(values, i);
           if (index < 0) {
@@ -69,8 +66,7 @@ public final class TreeVisualizer {
           }
           buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
               .append(attrValues[attr][i]);
-          buff.append(toStringNode(childs[index], dataset, attrNames, fields,
-              layer + 1));
+          buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1));
         }
       } else if (node instanceof NumericalNode) {
         NumericalNode nnode = (NumericalNode) node;

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java Sun Jun  9 10:59:47 2013
@@ -56,8 +56,7 @@ public final class UDistrib {
   
   private static final Logger log = LoggerFactory.getLogger(UDistrib.class);
   
-  private UDistrib() {
-  }
+  private UDistrib() {}
   
   /**
    * Launch the uniform distribution tool. Requires the following command line arguments:<br>
@@ -161,12 +160,11 @@ public final class UDistrib {
     FSDataInputStream input = ifs.open(dataPath);
     Scanner scanner = new Scanner(input, "UTF-8");
     DataConverter converter = new DataConverter(dataset);
-    int nbInstances = dataset.nbInstances();
     
     int id = 0;
     while (scanner.hasNextLine()) {
       if (id % 1000 == 0) {
-        log.info("progress : {} / {}", id, nbInstances);
+        log.info("progress : {}", id);
       }
       
       String line = scanner.nextLine();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java Sun Jun  9 10:59:47 2013
@@ -1,71 +1,55 @@
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
  */
-
 package org.apache.mahout.classifier.df.data;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.Random;
 
 import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.RandomUtils;
 import org.junit.Test;
 
 public final class DatasetTest extends MahoutTestCase {
 
-  private static final int NUM_ATTRIBUTES = 10;
-
-  private static Dataset readDataset(byte[] bytes) throws IOException {
-    ByteArrayInputStream byteInStream = new ByteArrayInputStream(bytes);
-    DataInput in = new DataInputStream(byteInStream);
-    return Dataset.read(in);
-  }
-
   @Test
-  public void testWritable() throws Exception {
+  public void jsonEncoding() throws DescriptorException {
+
+    String json = "["
+            + "{\"values\":null,\"label\":false,\"type\":\"numerical\"},"
+            + "{\"values\":[\"foo\",\"bar\"],\"label\":false,\"type\":\"categorical\"},"
+            + "{\"values\":null,\"label\":false,\"type\":\"ignored\"},"
+            + "{\"values\":null,\"label\":true,\"type\":\"numerical\"}"
+            + "]";
+    Dataset to = DataLoader.generateDataset("N C I L", true, new String[]{"1 foo 2 3", "4 bar 5 6"});
+
+    // to JSON
+    assertEquals(json, to.toJSON());
+    assertEquals(3, to.nbAttributes());
+    assertEquals(1, to.getIgnored().length);
+    assertEquals(2, to.getIgnored()[0]);
+    assertEquals(2, to.getLabelId());
+    assertTrue(to.isNumerical(0));
+
+    // from JSON
+    Dataset fromJson = new Dataset().fromJSON(json);
+    assertEquals(3, fromJson.nbAttributes());
+    assertEquals(1, fromJson.getIgnored().length);
+    assertEquals(2, fromJson.getIgnored()[0]);
+    assertTrue(fromJson.isNumerical(0));
+    
+    // read values for a nominal
+    assertEquals(0, fromJson.valueOf(1, "foo"));
 
-    Random rng = RandomUtils.getRandom();
-    ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
-    DataOutput out = new DataOutputStream(byteOutStream);
-
-    int n = 10;
-    for (int nloop = 0; nloop < n; nloop++) {
-      byteOutStream.reset();
-      
-      Dataset dataset = Utils.randomData(rng, NUM_ATTRIBUTES, false, 1).getDataset();
-      
-      dataset.write(out);
-      
-      assertEquals(dataset, readDataset(byteOutStream.toByteArray()));
-
-      // regression
-      byteOutStream.reset();
-      
-      dataset = Utils.randomData(rng, NUM_ATTRIBUTES, true, 1).getDataset();
-      
-      dataset.write(out);
-      
-      assertEquals(dataset, readDataset(byteOutStream.toByteArray()));
-    }
   }
-  
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java Sun Jun  9 10:59:47 2013
@@ -38,8 +38,8 @@ import org.apache.mahout.classifier.df.d
  *
  */
 public final class Utils {
-  private Utils() {
-  }
+
+  private Utils() {}
 
   /** Used when generating random CATEGORICAL values */
   private static final int CATEGORICAL_RANGE = 100;
@@ -82,8 +82,8 @@ public final class Utils {
   public static String generateDescriptor(char[] tokens) {
     StringBuilder builder = new StringBuilder();
 
-    for (char token1 : tokens) {
-      builder.append(token1).append(' ');
+    for (char token : tokens) {
+      builder.append(token).append(' ');
     }
 
     return builder.toString();
@@ -102,34 +102,14 @@ public final class Utils {
   }
 
   /**
-   * generates random data
-   * 
-   * @param rng Random number generator
-   * @param nbAttributes number of attributes
-   * @param regression true is the label is numerical
-   * @param number of data lines to generate
-   */
-  public static double[][] randomDoubles(Random rng, int nbAttributes, boolean regression, int number) throws DescriptorException {
-    String descriptor = randomDescriptor(rng, nbAttributes);
-    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
-
-    double[][] data = new double[number][];
-
-    for (int index = 0; index < number; index++) {
-      data[index] = randomVector(rng, attrs, regression);
-    }
-
-    return data;
-  }
-
-  /**
    * generates random data based on the given descriptor
    * 
    * @param rng Random number generator
    * @param descriptor attributes description
    * @param number number of data lines to generate
    */
-  public static double[][] randomDoubles(Random rng, CharSequence descriptor, boolean regression, int number) throws DescriptorException {
+  public static double[][] randomDoubles(Random rng, CharSequence descriptor, boolean regression, int number)
+    throws DescriptorException {
     Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
 
     double[][] data = new double[number][];
@@ -255,8 +235,9 @@ public final class Utils {
   }
 
   private static void writeDataToFile(String[] sData, Path path) throws IOException {
-    BufferedWriter output = Files.newWriter(new File(path.toString()), Charsets.UTF_8);
+    BufferedWriter output = null;
     try {
+      output = Files.newWriter(new File(path.toString()), Charsets.UTF_8);
       for (String line : sData) {
         output.write(line);
         output.write('\n');
@@ -281,25 +262,6 @@ public final class Utils {
     return path;
   }
 
-  public static Path writeDatasetToTestFile(Dataset dataset) throws IOException {
-    Path testData = new Path("testdata/Dataset");
-    FileSystem fs = testData.getFileSystem(new Configuration());
-    if (!fs.exists(testData)) {
-      fs.mkdirs(testData);
-    }
-  
-    Path datasetPath = new Path(testData, "dataset.info");
-    FSDataOutputStream out = fs.create(datasetPath);
-  
-    try {
-      dataset.write(out);
-    } finally {
-      Closeables.closeQuietly(out);
-    }
-  
-    return datasetPath;
-  }
-
   /**
    * Split the data into numMaps splits
    */
@@ -311,8 +273,7 @@ public final class Utils {
   
     for (int partition = 0; partition < numMaps; partition++) {
       int from = partition * partitionSize;
-      int to = partition == (numMaps - 1) ? nbInstances : (partition + 1)
-          * partitionSize;
+      int to = partition == (numMaps - 1) ? nbInstances : (partition + 1) * partitionSize;
   
       splits[partition] = Arrays.copyOfRange(sData, from, to);
     }