You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2014/01/21 22:33:28 UTC

svn commit: r1560202 - in /mahout/trunk: CHANGELOG core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java

Author: ssc
Date: Tue Jan 21 21:33:28 2014
New Revision: 1560202

URL: http://svn.apache.org/r1560202
Log:
MAHOUT-1398 FileDataModel should provide a constructor with a delimiterPattern

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1560202&r1=1560201&r2=1560202&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Tue Jan 21 21:33:28 2014
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.9 - unreleased
 
+  MAHOUT-1398: FileDataModel should provide a constructor with a delimiterPattern (Roy Guo via ssc)
+
   MAHOUT-1400: Remove references to deprecated and removed algorithms from examples scripts (ssc)
 
   MAHOUT-1396: Accidental use of commons-math won't work with next Hadoop 2 release (srowen)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=1560202&r1=1560201&r2=1560202&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java Tue Jan 21 21:33:28 2014
@@ -28,9 +28,6 @@ import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.locks.ReentrantLock;
 
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
 import org.apache.mahout.cf.taste.common.Refreshable;
 import org.apache.mahout.cf.taste.common.TasteException;
 import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
@@ -49,6 +46,9 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.google.common.base.Preconditions;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
 
 /**
  * <p>
@@ -148,7 +148,15 @@ public class FileDataModel extends Abstr
   public FileDataModel(File dataFile) throws IOException {
     this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS);
   }
-
+  
+  /**
+   * @param delimiterRegex If your data file doesn't use '\t' or ',' as its delimiter, you can specify
+   * a custom regex pattern.
+   */
+  public FileDataModel(File dataFile, String delimiterRegex) throws IOException {
+    this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS, delimiterRegex);
+  }
+  
   /**
    * @param transpose
    *          transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
@@ -158,6 +166,17 @@ public class FileDataModel extends Abstr
    * @see #FileDataModel(File)
    */
   public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException {
+    this(dataFile, transpose, minReloadIntervalMS, null);
+  }
+  
+  /**
+   * @param delimiterRegex If your data file doesn't use '\t' or ',' as delimiters, you can specify
+   * your own using a regex pattern.
+   * @throws IOException
+   */
+  public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex)
+    throws IOException {
+
     this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile());
     if (!dataFile.exists() || dataFile.isDirectory()) {
       throw new FileNotFoundException(dataFile.toString());
@@ -178,8 +197,16 @@ public class FileDataModel extends Abstr
     }
     Closeables.close(iterator, true);
 
-    delimiter = determineDelimiter(firstLine);
-    delimiterPattern = Splitter.on(delimiter);
+    if (delimiterRegex == null) {
+      delimiter = determineDelimiter(firstLine);
+      delimiterPattern = Splitter.on(delimiter);
+    } else {
+      delimiter = '\0';
+      delimiterPattern = Splitter.onPattern(delimiterRegex);
+      if (!delimiterPattern.split(firstLine).iterator().hasNext()) {
+        throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line");
+      }
+    }
     List<String> firstLineSplit = Lists.newArrayList();
     for (String token : delimiterPattern.split(firstLine)) {
       firstLineSplit.add(token);
@@ -198,10 +225,6 @@ public class FileDataModel extends Abstr
     return dataFile;
   }
 
-  public char getDelimiter() {
-    return delimiter;
-  }
-
   protected void reload() {
     if (reloadLock.tryLock()) {
       try {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java?rev=1560202&r1=1560201&r2=1560202&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModelTest.java Tue Jan 21 21:33:28 2014
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.cf.taste.impl.model.file;
 
+import java.io.File;
+import java.util.NoSuchElementException;
+
 import org.apache.commons.lang3.mutable.MutableBoolean;
 import org.apache.mahout.cf.taste.common.TasteException;
 import org.apache.mahout.cf.taste.impl.TasteTestCase;
@@ -33,9 +36,6 @@ import org.apache.mahout.cf.taste.simila
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
-import java.util.NoSuchElementException;
-
 /** <p>Tests {@link FileDataModel}.</p> */
 public final class FileDataModelTest extends TasteTestCase {
 
@@ -55,6 +55,23 @@ public final class FileDataModelTest ext
       "456,789,0.5",
       "456,654,0.0",
       "456,999,0.2",};
+  
+  private static final String[] DATA_SPLITTED_WITH_TWO_SPACES = {
+      "123  456  0.1",
+      "123  789  0.6",
+      "123  654  0.7",
+      "234  123  0.5",
+      "234  234  1.0",
+      "234  999  0.9",
+      "345  789  0.6",
+      "345  654  0.7",
+      "345  123  1.0",
+      "345  234  0.5",
+      "345  999  0.5",
+      "456  456  0.1",
+      "456  789  0.5",
+      "456  654  0.0",
+      "456  999  0.2",};
 
   private DataModel model;
   private File testFile;
@@ -67,6 +84,15 @@ public final class FileDataModelTest ext
     writeLines(testFile, DATA);
     model = new FileDataModel(testFile);
   }
+  
+  @Test
+  public void testReadRegexSplittedFile() throws Exception {
+    File testFile = getTestTempFile("testRegex.txt");
+    writeLines(testFile, DATA_SPLITTED_WITH_TWO_SPACES);
+    FileDataModel model = new FileDataModel(testFile,"\\s+");
+    assertEquals(model.getItemIDsFromUser(123).size(), 3);
+    assertEquals(model.getItemIDsFromUser(456).size(), 4);
+  }
 
   @Test
   public void testFile() throws Exception {