You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/06/09 12:59:47 UTC
svn commit: r1491181 - in /mahout/trunk: ./
core/src/main/java/org/apache/mahout/classifier/df/
core/src/main/java/org/apache/mahout/classifier/df/data/
core/src/main/java/org/apache/mahout/classifier/df/tools/
core/src/test/java/org/apache/mahout/clas...
Author: ssc
Date: Sun Jun 9 10:59:47 2013
New Revision: 1491181
URL: http://svn.apache.org/r1491181
Log:
MAHOUT-1163: Make random forest classifier meta-data file human readable
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Jun 9 10:59:47 2013
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.8 - unreleased
+ MAHOUT-1163: Make random forest classifier meta-data file human readable (Marty Kube via ssc)
+
MAHOUT-1243: Dictionary file format in Lucene-Mahout integration is not in SequenceFileFormat (ssc)
MAHOUT-974: org.apache.mahout.cf.taste.hadoop.als.ParallelALSFactorizationJob use integer as userId and itemId (ssc)
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/DFUtils.java Sun Jun 9 10:59:47 2013
@@ -30,14 +30,17 @@ import org.apache.mahout.common.iterator
import java.io.DataInput;
import java.io.DataOutput;
+import java.io.DataOutputStream;
import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.List;
/**
* Utility class that contains various helper methods
*/
public final class DFUtils {
- private DFUtils() { }
+
+ private DFUtils() {}
/**
* Writes an Node[] into a DataOutput
@@ -157,4 +160,22 @@ public final class DFUtils {
Closeables.closeQuietly(out);
}
}
+
+ /**
+ * Writes a string to a path.
+ *
+ * @param conf From which the file system will be picked
+ * @param path Where the string will be written
+ * @param string The string to write
+ * @throws IOException if the file cannot be created, written or closed
+ */
+ public static void storeString(Configuration conf, Path path, String string) throws IOException {
+ // Encoded with the platform default charset, which is also what Dataset.load() decodes with.
+ byte[] bytes = string.getBytes(Charset.defaultCharset());
+ DataOutputStream out = path.getFileSystem(conf).create(path);
+ boolean threw = true;
+ try {
+ out.write(bytes);
+ threw = false;
+ } finally {
+ // A failed close() on an output stream can mean the data never reached the file system, so it is
+ // only swallowed when an earlier exception is already propagating.
+ Closeables.close(out, threw);
+ }
+ }
+
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java Sun Jun 9 10:59:47 2013
@@ -18,26 +18,28 @@
package org.apache.mahout.classifier.df.data;
import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.mahout.classifier.df.DFUtils;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.type.TypeReference;
-import java.io.DataInput;
-import java.io.DataOutput;
import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;
+import java.util.Locale;
+import java.util.Map;
/**
* Contains informations about the attributes.
*/
-public class Dataset implements Writable {
+public class Dataset {
/**
* Attributes type
@@ -63,6 +65,19 @@ public class Dataset implements Writable
public boolean isIgnored() {
return this == IGNORED;
}
+
+ /**
+ * Resolves an attribute type from its textual form, ignoring case.
+ *
+ * @param from text to resolve; may be null
+ * @return NUMERICAL, CATEGORICAL or IGNORED when {@code from} equals that constant's string form,
+ * LABEL for anything else
+ */
+ private static Attribute fromString(String from) {
+ for (Attribute candidate : new Attribute[] {NUMERICAL, CATEGORICAL, IGNORED}) {
+ if (candidate.toString().equalsIgnoreCase(from)) {
+ return candidate;
+ }
+ }
+ // no match (including null input) falls back to LABEL
+ return LABEL;
+ }
}
private Attribute[] attributes;
@@ -86,8 +101,16 @@ public class Dataset implements Writable
* number of instances in the dataset
*/
private int nbInstances;
+
+ /** JSON serial/de-serial-izer */
+ private static final ObjectMapper objectMapper = new ObjectMapper();
+
+ // Some literals for JSON representation
+ final static String TYPE = "type";
+ final static String VALUES = "values";
+ final static String LABEL = "label";
- private Dataset() {
+ protected Dataset() {
}
/**
@@ -161,9 +184,9 @@ public class Dataset implements Writable
public double getLabel(Instance instance) {
return instance.get(getLabelId());
}
-
- public int nbInstances() {
- return nbInstances;
+
+ public Attribute getAttribute(int attr) {
+ return attributes[attr];
}
/**
@@ -190,11 +213,15 @@ public class Dataset implements Writable
}
return values[labelId][(int) code];
}
+
+ /**
+ * @return the attribute types of this dataset, rendered as {@code attributes=[...]}
+ */
+ @Override
+ public String toString() {
+ return "attributes=" + Arrays.toString(attributes);
+ }
/**
- * Converts a token to its corresponding int code for a given attribute
+ * Converts a token to its corresponding integer code for a given attribute
*
- * @param attr attribute's index
+ * @param attr attribute index
*/
public int valueOf(int attr, String token) {
Preconditions.checkArgument(!isNumerical(attr), "Only for CATEGORICAL attributes");
@@ -206,7 +233,6 @@ public class Dataset implements Writable
return ignored;
}
-
/**
* @return number of attributes that are not IGNORED
*/
@@ -294,63 +320,110 @@ public class Dataset implements Writable
* @throws java.io.IOException
*/
public static Dataset load(Configuration conf, Path path) throws IOException {
+
FileSystem fs = path.getFileSystem(conf);
+ long bytesToRead = fs.getFileStatus(path).getLen();
+ // The whole description is buffered in a single array; refuse a length an array cannot hold
+ // instead of silently truncating the long to an int.
+ if (bytesToRead > Integer.MAX_VALUE) {
+ throw new IOException("Dataset description " + path + " is too large to load: " + bytesToRead + " bytes");
+ }
+ byte[] buff = new byte[(int) bytesToRead];
FSDataInputStream input = fs.open(path);
try {
- return read(input);
+ input.readFully(buff);
} finally {
Closeables.closeQuietly(input);
}
+ // Decoded with the platform default charset, which is also what DFUtils.storeString() encodes with.
+ String json = new String(buff, Charset.defaultCharset());
+ return fromJSON(json);
}
+
- public static Dataset read(DataInput in) throws IOException {
- Dataset dataset = new Dataset();
-
- dataset.readFields(in);
- return dataset;
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- int nbAttributes = in.readInt();
- attributes = new Attribute[nbAttributes];
- for (int attr = 0; attr < nbAttributes; attr++) {
- String name = WritableUtils.readString(in);
- attributes[attr] = Attribute.valueOf(name);
- }
-
- ignored = DFUtils.readIntArray(in);
-
- // only CATEGORICAL attributes have values
- values = new String[nbAttributes][];
- for (int attr = 0; attr < nbAttributes; attr++) {
- if (attributes[attr].isCategorical()) {
- values[attr] = WritableUtils.readStringArray(in);
+ /**
+ * Serializes the attribute description of this dataset to JSON.
+ * <p>
+ * The result is a JSON array with one object per column, ignored columns included, in column order.
+ * Each object carries the lower-cased attribute type, the attribute's values (null for ignored columns)
+ * and a flag telling whether the column is the label. The number of instances is not written.
+ *
+ * @return the JSON representation of the attributes
+ * @throws RuntimeException if the object mapper fails to serialize the description
+ */
+ public String toJSON() {
+
+ List<Map<String, Object>> toWrite = Lists.newLinkedList();
+ // the attributes array does not include ignored columns, but it does include the class label
+ int ignoredCount = 0;
+ for (int i = 0; i < attributes.length + ignored.length; i++) {
+ Map<String, Object> attribute = null;
+ int attributesIndex = i - ignoredCount;
+ if (ignoredCount < ignored.length && i == ignored[ignoredCount]) {
+ // fill in the ignored attribute
+ attribute = getMap(Attribute.IGNORED, null, false);
+ ignoredCount++;
+ } else if (attributesIndex == labelId) {
+ // fill in the label
+ attribute = getMap(attributes[attributesIndex], values[attributesIndex], true);
+ } else {
+ // normal attribute
+ attribute = getMap(attributes[attributesIndex], values[attributesIndex], false);
}
+ toWrite.add(attribute);
}
-
- labelId = in.readInt();
- nbInstances = in.readInt();
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeInt(attributes.length); // nb attributes
- for (Attribute attr : attributes) {
- WritableUtils.writeString(out, attr.name());
+ try {
+ return objectMapper.writeValueAsString(toWrite);
+ } catch (Exception ex) {
+ throw new RuntimeException(ex);
}
+ }
- DFUtils.writeArray(out, ignored);
+ /**
+ * De-serializes an instance from the JSON produced by {@link #toJSON()}.
+ *
+ * @param json JSON array with one object per column, each holding the "type", "values" and "label" keys
+ * @return a new Dataset holding the described attributes
+ * @throws RuntimeException if the JSON cannot be parsed
+ */
+ public static Dataset fromJSON(String json) {
- // only CATEGORICAL attributes have values
- for (String[] vals : values) {
- if (vals != null) {
- WritableUtils.writeStringArray(out, vals);
+ Dataset dataset = new Dataset();
+ List<Map<String, Object>> fromJSON;
+ try {
+ fromJSON = objectMapper.readValue(json, new TypeReference<List<Map<String, Object>>>() {});
+ } catch (Exception ex) {
+ throw new RuntimeException(ex);
+ }
+ List<Attribute> attributes = Lists.newLinkedList();
+ List<Integer> ignored = Lists.newLinkedList();
+ String[][] nominalValues = new String[fromJSON.size()][];
+ for (int i = 0; i < fromJSON.size(); i++) {
+ Map<String, Object> attribute = fromJSON.get(i);
+ Attribute asAttribute = Attribute.fromString((String) attribute.get(TYPE));
+ if (asAttribute == Attribute.IGNORED) {
+ ignored.add(i);
+ } else {
+ // Position among the non-ignored attributes. toJSON() reads attributes and values with this
+ // index, so values must be stored under it too, not under the raw column index i.
+ int attributesIndex = i - ignored.size();
+ attributes.add(asAttribute);
+ // a missing "label" entry means "not the label" rather than a NullPointerException
+ if (Boolean.TRUE.equals(attribute.get(LABEL))) {
+ dataset.labelId = attributesIndex;
+ }
+ List<?> rawValues = (List<?>) attribute.get(VALUES);
+ if (rawValues != null) {
+ nominalValues[attributesIndex] = rawValues.toArray(new String[rawValues.size()]);
+ }
}
}
-
- out.writeInt(labelId);
- out.writeInt(nbInstances);
+ dataset.attributes = attributes.toArray(new Attribute[attributes.size()]);
+ dataset.ignored = new int[ignored.size()];
+ dataset.values = nominalValues;
+ int next = 0;
+ for (int column : ignored) {
+ dataset.ignored[next++] = column;
+ }
+ return dataset;
+ }
+
+ /**
+ * Builds the JSON-ready description of a single attribute.
+ *
+ * @param type the attribute type; stored lower-cased under the "type" key
+ * @param values the attribute's values, stored under the "values" key; may be null
+ * @param isLabel whether the attribute is the label, stored under the "label" key
+ * @return a new map holding the three entries
+ */
+ private Map<String, Object> getMap(Attribute type, String[] values, boolean isLabel) {
+
+ Map<String, Object> attribute = Maps.newHashMap();
+ // Locale.ROOT keeps the written type names identical on every machine; lower-casing with the
+ // default locale would e.g. produce a dotless i for "IGNORED" under a Turkish locale.
+ attribute.put(TYPE, type.toString().toLowerCase(Locale.ROOT));
+ attribute.put(VALUES, values);
+ attribute.put(LABEL, isLabel);
+ return attribute;
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java Sun Jun 9 10:59:47 2013
@@ -49,8 +49,7 @@ public final class Describe {
private static final Logger log = LoggerFactory.getLogger(Describe.class);
- private Describe() {
- }
+ private Describe() {}
public static void main(String[] args) throws IOException, DescriptorException {
@@ -116,7 +115,8 @@ public final class Describe {
Dataset dataset = generateDataset(descriptor, dataPath, regression);
log.info("storing the dataset description");
- DFUtils.storeWritable(new Configuration(), fPath, dataset);
+ String json = dataset.toJSON();
+ DFUtils.storeString(new Configuration(), fPath, json);
}
private static Dataset generateDataset(String descriptor, String dataPath, boolean regression) throws IOException,
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java Sun Jun 9 10:59:47 2013
@@ -52,12 +52,9 @@ public final class TreeVisualizer {
if (node instanceof CategoricalNode) {
CategoricalNode cnode = (CategoricalNode) node;
int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
- double[] values = (double[]) fields.get("CategoricalNode.values").get(
- cnode);
- Node[] childs = (Node[]) fields.get("CategoricalNode.childs")
- .get(cnode);
- String[][] attrValues = (String[][]) fields.get("Dataset.values").get(
- dataset);
+ double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
+ Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
+ String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
for (int i = 0; i < attrValues[attr].length; i++) {
int index = ArrayUtils.indexOf(values, i);
if (index < 0) {
@@ -69,8 +66,7 @@ public final class TreeVisualizer {
}
buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
.append(attrValues[attr][i]);
- buff.append(toStringNode(childs[index], dataset, attrNames, fields,
- layer + 1));
+ buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1));
}
} else if (node instanceof NumericalNode) {
NumericalNode nnode = (NumericalNode) node;
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java Sun Jun 9 10:59:47 2013
@@ -56,8 +56,7 @@ public final class UDistrib {
private static final Logger log = LoggerFactory.getLogger(UDistrib.class);
- private UDistrib() {
- }
+ private UDistrib() {}
/**
* Launch the uniform distribution tool. Requires the following command line arguments:<br>
@@ -161,12 +160,11 @@ public final class UDistrib {
FSDataInputStream input = ifs.open(dataPath);
Scanner scanner = new Scanner(input, "UTF-8");
DataConverter converter = new DataConverter(dataset);
- int nbInstances = dataset.nbInstances();
int id = 0;
while (scanner.hasNextLine()) {
if (id % 1000 == 0) {
- log.info("progress : {} / {}", id, nbInstances);
+ log.info("progress : {}", id);
}
String line = scanner.nextLine();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/DatasetTest.java Sun Jun 9 10:59:47 2013
@@ -1,71 +1,55 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
*/
-
package org.apache.mahout.classifier.df.data;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.Random;
import org.apache.mahout.common.MahoutTestCase;
-import org.apache.mahout.common.RandomUtils;
import org.junit.Test;
public final class DatasetTest extends MahoutTestCase {
- private static final int NUM_ATTRIBUTES = 10;
-
- private static Dataset readDataset(byte[] bytes) throws IOException {
- ByteArrayInputStream byteInStream = new ByteArrayInputStream(bytes);
- DataInput in = new DataInputStream(byteInStream);
- return Dataset.read(in);
- }
-
@Test
- public void testWritable() throws Exception {
+ public void jsonEncoding() throws DescriptorException {
+
+ String json = "["
+ + "{\"values\":null,\"label\":false,\"type\":\"numerical\"},"
+ + "{\"values\":[\"foo\",\"bar\"],\"label\":false,\"type\":\"categorical\"},"
+ + "{\"values\":null,\"label\":false,\"type\":\"ignored\"},"
+ + "{\"values\":null,\"label\":true,\"type\":\"numerical\"}"
+ + "]";
+ Dataset to = DataLoader.generateDataset("N C I L", true, new String[]{"1 foo 2 3", "4 bar 5 6"});
+
+ // to JSON
+ assertEquals(json, to.toJSON());
+ assertEquals(3, to.nbAttributes());
+ assertEquals(1, to.getIgnored().length);
+ assertEquals(2, to.getIgnored()[0]);
+ assertEquals(2, to.getLabelId());
+ assertTrue(to.isNumerical(0));
+
+ // from JSON (fromJSON is static, so it is called on the class rather than on a throw-away instance)
+ Dataset fromJson = Dataset.fromJSON(json);
+ assertEquals(3, fromJson.nbAttributes());
+ assertEquals(1, fromJson.getIgnored().length);
+ assertEquals(2, fromJson.getIgnored()[0]);
+ assertEquals(2, fromJson.getLabelId());
+ assertTrue(fromJson.isNumerical(0));
+
+ // read values for a nominal
+ assertEquals(0, fromJson.valueOf(1, "foo"));
+
+ // the decoded dataset must serialize back to the JSON it was read from
+ assertEquals(json, fromJson.toJSON());
- Random rng = RandomUtils.getRandom();
- ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
- DataOutput out = new DataOutputStream(byteOutStream);
-
- int n = 10;
- for (int nloop = 0; nloop < n; nloop++) {
- byteOutStream.reset();
-
- Dataset dataset = Utils.randomData(rng, NUM_ATTRIBUTES, false, 1).getDataset();
-
- dataset.write(out);
-
- assertEquals(dataset, readDataset(byteOutStream.toByteArray()));
-
- // regression
- byteOutStream.reset();
-
- dataset = Utils.randomData(rng, NUM_ATTRIBUTES, true, 1).getDataset();
-
- dataset.write(out);
-
- assertEquals(dataset, readDataset(byteOutStream.toByteArray()));
- }
}
-
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java?rev=1491181&r1=1491180&r2=1491181&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/df/data/Utils.java Sun Jun 9 10:59:47 2013
@@ -38,8 +38,8 @@ import org.apache.mahout.classifier.df.d
*
*/
public final class Utils {
- private Utils() {
- }
+
+ private Utils() {}
/** Used when generating random CATEGORICAL values */
private static final int CATEGORICAL_RANGE = 100;
@@ -82,8 +82,8 @@ public final class Utils {
public static String generateDescriptor(char[] tokens) {
StringBuilder builder = new StringBuilder();
- for (char token1 : tokens) {
- builder.append(token1).append(' ');
+ for (char token : tokens) {
+ builder.append(token).append(' ');
}
return builder.toString();
@@ -102,34 +102,14 @@ public final class Utils {
}
/**
- * generates random data
- *
- * @param rng Random number generator
- * @param nbAttributes number of attributes
- * @param regression true is the label is numerical
- * @param number of data lines to generate
- */
- public static double[][] randomDoubles(Random rng, int nbAttributes, boolean regression, int number) throws DescriptorException {
- String descriptor = randomDescriptor(rng, nbAttributes);
- Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
-
- double[][] data = new double[number][];
-
- for (int index = 0; index < number; index++) {
- data[index] = randomVector(rng, attrs, regression);
- }
-
- return data;
- }
-
- /**
* generates random data based on the given descriptor
*
* @param rng Random number generator
* @param descriptor attributes description
* @param number number of data lines to generate
*/
- public static double[][] randomDoubles(Random rng, CharSequence descriptor, boolean regression, int number) throws DescriptorException {
+ public static double[][] randomDoubles(Random rng, CharSequence descriptor, boolean regression, int number)
+ throws DescriptorException {
Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
double[][] data = new double[number][];
@@ -255,8 +235,9 @@ public final class Utils {
}
private static void writeDataToFile(String[] sData, Path path) throws IOException {
- BufferedWriter output = Files.newWriter(new File(path.toString()), Charsets.UTF_8);
+ BufferedWriter output = null;
try {
+ output = Files.newWriter(new File(path.toString()), Charsets.UTF_8);
for (String line : sData) {
output.write(line);
output.write('\n');
@@ -281,25 +262,6 @@ public final class Utils {
return path;
}
- public static Path writeDatasetToTestFile(Dataset dataset) throws IOException {
- Path testData = new Path("testdata/Dataset");
- FileSystem fs = testData.getFileSystem(new Configuration());
- if (!fs.exists(testData)) {
- fs.mkdirs(testData);
- }
-
- Path datasetPath = new Path(testData, "dataset.info");
- FSDataOutputStream out = fs.create(datasetPath);
-
- try {
- dataset.write(out);
- } finally {
- Closeables.closeQuietly(out);
- }
-
- return datasetPath;
- }
-
/**
* Split the data into numMaps splits
*/
@@ -311,8 +273,7 @@ public final class Utils {
for (int partition = 0; partition < numMaps; partition++) {
int from = partition * partitionSize;
- int to = partition == (numMaps - 1) ? nbInstances : (partition + 1)
- * partitionSize;
+ int to = partition == (numMaps - 1) ? nbInstances : (partition + 1) * partitionSize;
splits[partition] = Arrays.copyOfRange(sData, from, to);
}