You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2014/01/21 22:33:28 UTC
svn commit: r1560202 - in /mahout/trunk: CHANGELOG
core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
Author: ssc
Date: Tue Jan 21 21:33:28 2014
New Revision: 1560202
URL: http://svn.apache.org/r1560202
Log:
MAHOUT-1398 FileDataModel should provide a constructor with a delimiterPattern
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1560202&r1=1560201&r2=1560202&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Tue Jan 21 21:33:28 2014
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.9 - unreleased
+ MAHOUT-1398: FileDataModel should provide a constructor with a delimiterPattern (Roy Guo via ssc)
+
MAHOUT-1400: Remove references to deprecated and removed algorithms from examples scripts (ssc)
MAHOUT-1396: Accidental use of commons-math won't work with next Hadoop 2 release (srowen)
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=1560202&r1=1560201&r2=1560202&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java Tue Jan 21 21:33:28 2014
@@ -28,9 +28,6 @@ import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.locks.ReentrantLock;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
@@ -49,6 +46,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
/**
* <p>
@@ -148,7 +148,15 @@ public class FileDataModel extends Abstr
public FileDataModel(File dataFile) throws IOException {
this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS);
}
-
+
+ /**
+ * @param delimiterRegex If your data file doesn't use '\t' or ',' as its delimiter, you can specify
+ * a custom regex pattern.
+ */
+ public FileDataModel(File dataFile, String delimiterRegex) throws IOException {
+ this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS, delimiterRegex);
+ }
+
/**
* @param transpose
* transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
@@ -158,6 +166,17 @@ public class FileDataModel extends Abstr
* @see #FileDataModel(File)
*/
public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException {
+ this(dataFile, transpose, minReloadIntervalMS, null);
+ }
+
+ /**
+ * @param delimiterRegex If your data file doesn't use '\t' or ',' as its delimiter, you can specify
+ * your own using a regex pattern.
+ * @throws IOException
+ */
+ public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex)
+ throws IOException {
+
this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile());
if (!dataFile.exists() || dataFile.isDirectory()) {
throw new FileNotFoundException(dataFile.toString());
@@ -178,8 +197,16 @@ public class FileDataModel extends Abstr
}
Closeables.close(iterator, true);
- delimiter = determineDelimiter(firstLine);
- delimiterPattern = Splitter.on(delimiter);
+ if (delimiterRegex == null) {
+ delimiter = determineDelimiter(firstLine);
+ delimiterPattern = Splitter.on(delimiter);
+ } else {
+ delimiter = '\0';
+ delimiterPattern = Splitter.onPattern(delimiterRegex);
+ if (!delimiterPattern.split(firstLine).iterator().hasNext()) {
+ throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line");
+ }
+ }
List<String> firstLineSplit = Lists.newArrayList();
for (String token : delimiterPattern.split(firstLine)) {
firstLineSplit.add(token);
@@ -198,10 +225,6 @@ public class FileDataModel extends Abstr
return dataFile;
}
- public char getDelimiter() {
- return delimiter;
- }
-
protected void reload() {
if (reloadLock.tryLock()) {
try {
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java?rev=1560202&r1=1560201&r2=1560202&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java Tue Jan 21 21:33:28 2014
@@ -17,6 +17,9 @@
package org.apache.mahout.cf.taste.impl.model.file;
+import java.io.File;
+import java.util.NoSuchElementException;
+
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.TasteTestCase;
@@ -33,9 +36,6 @@ import org.apache.mahout.cf.taste.simila
import org.junit.Before;
import org.junit.Test;
-import java.io.File;
-import java.util.NoSuchElementException;
-
/** <p>Tests {@link FileDataModel}.</p> */
public final class FileDataModelTest extends TasteTestCase {
@@ -55,6 +55,23 @@ public final class FileDataModelTest ext
"456,789,0.5",
"456,654,0.0",
"456,999,0.2",};
+
+ private static final String[] DATA_SPLITTED_WITH_TWO_SPACES = {
+ "123 456 0.1",
+ "123 789 0.6",
+ "123 654 0.7",
+ "234 123 0.5",
+ "234 234 1.0",
+ "234 999 0.9",
+ "345 789 0.6",
+ "345 654 0.7",
+ "345 123 1.0",
+ "345 234 0.5",
+ "345 999 0.5",
+ "456 456 0.1",
+ "456 789 0.5",
+ "456 654 0.0",
+ "456 999 0.2",};
private DataModel model;
private File testFile;
@@ -67,6 +84,15 @@ public final class FileDataModelTest ext
writeLines(testFile, DATA);
model = new FileDataModel(testFile);
}
+
+ @Test
+ public void testReadRegexSplittedFile() throws Exception {
+ File testFile = getTestTempFile("testRegex.txt");
+ writeLines(testFile, DATA_SPLITTED_WITH_TWO_SPACES);
+ FileDataModel model = new FileDataModel(testFile,"\\s+");
+ assertEquals(model.getItemIDsFromUser(123).size(), 3);
+ assertEquals(model.getItemIDsFromUser(456).size(), 4);
+ }
@Test
public void testFile() throws Exception {