Posted to commits@mahout.apache.org by pa...@apache.org on 2014/09/04 19:28:30 UTC

git commit: MAHOUT-1604, MAHOUT-1541 changes all references to position in the CLI to column

Repository: mahout
Updated Branches:
  refs/heads/master d9e26c64d -> e24c4afb6


MAHOUT-1604, MAHOUT-1541 changes all references to position in the CLI to column


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/e24c4afb
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/e24c4afb
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/e24c4afb

Branch: refs/heads/master
Commit: e24c4afb699c2930d372c701fe2de874a2a2f6c0
Parents: d9e26c6
Author: pferrel <pa...@occamsmachete.com>
Authored: Thu Sep 4 09:44:00 2014 -0700
Committer: pferrel <pa...@occamsmachete.com>
Committed: Thu Sep 4 09:55:17 2014 -0700

----------------------------------------------------------------------
 .../mahout/drivers/ItemSimilarityDriver.scala   | 10 ++--
 .../mahout/drivers/MahoutOptionParser.scala     | 24 ++++----
 .../org/apache/mahout/drivers/Schema.scala      |  4 +-
 .../drivers/TextDelimitedReaderWriter.scala     | 10 ++--
 .../drivers/ItemSimilarityDriverSuite.scala     | 60 ++++++++++----------
 5 files changed, 54 insertions(+), 54 deletions(-)
----------------------------------------------------------------------
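
In practical terms the change is confined to the driver's command-line surface: --rowIDPosition, --itemIDPosition, and --filterPosition become --rowIDColumn, --itemIDColumn, and --filterColumn, while the short forms rc, ic, and fc stay the same. A minimal sketch of an in-process driver call with the renamed flags; the input and output paths are hypothetical placeholders, not part of this commit:

    import org.apache.mahout.drivers.ItemSimilarityDriver

    // In-process invocation using the renamed flags.
    ItemSimilarityDriver.main(Array(
      "--input", "/tmp/interactions.csv",     // hypothetical path
      "--output", "/tmp/indicator-matrices",  // hypothetical path
      "--inDelim", ",",
      "--rowIDColumn", "0",                   // was --rowIDPosition
      "--itemIDColumn", "2",                  // was --itemIDPosition
      "--filterColumn", "1"))                 // was --filterPosition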


http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
index b05b55d..0b8ded6 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/ItemSimilarityDriver.scala
@@ -128,13 +128,13 @@ object ItemSimilarityDriver extends MahoutDriver {
 
     val readSchema1 = new Schema("delim" -> parser.opts("inDelim").asInstanceOf[String],
       "filter" -> parser.opts("filter1").asInstanceOf[String],
-      "rowIDPosition" -> parser.opts("rowIDPosition").asInstanceOf[Int],
-      "columnIDPosition" -> parser.opts("itemIDPosition").asInstanceOf[Int],
-      "filterPosition" -> parser.opts("filterPosition").asInstanceOf[Int])
+      "rowIDColumn" -> parser.opts("rowIDColumn").asInstanceOf[Int],
+      "columnIDPosition" -> parser.opts("itemIDColumn").asInstanceOf[Int],
+      "filterColumn" -> parser.opts("filterColumn").asInstanceOf[Int])
 
     reader1 = new TextDelimitedIndexedDatasetReader(readSchema1)
 
-    if ((parser.opts("filterPosition").asInstanceOf[Int] != -1 && parser.opts("filter2").asInstanceOf[String] != null)
+    if ((parser.opts("filterColumn").asInstanceOf[Int] != -1 && parser.opts("filter2").asInstanceOf[String] != null)
       || (parser.opts("input2").asInstanceOf[String] != null && !parser.opts("input2").asInstanceOf[String].isEmpty )){
       // only need to change the filter used compared to readSchema1
       val readSchema2 = new Schema(readSchema1) += ("filter" -> parser.opts("filter2").asInstanceOf[String])
@@ -180,7 +180,7 @@ object ItemSimilarityDriver extends MahoutDriver {
 
         datasetB
 
-      } else if (parser.opts("filterPosition").asInstanceOf[Int] != -1
+      } else if (parser.opts("filterColumn").asInstanceOf[Int] != -1
         && parser.opts("filter2").asInstanceOf[String] != null) {
 
         // get cross-cooccurrences interactions by using two filters on a single set of files

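Two details of this hunk are easy to miss: Schema is just a mutable String -> Any map, and the value stored under the unrenamed key "columnIDPosition" now comes from the renamed itemIDColumn option. A stand-alone sketch of the same construction, with literal values standing in for the parser.opts lookups:

    import scala.collection.mutable.HashMap

    // Stand-in for org.apache.mahout.drivers.Schema, which extends
    // HashMap[String, Any]; the keys mirror readSchema1 above.
    val readSchema = HashMap[String, Any](
      "delim" -> "[,\t ]",
      "filter" -> "purchase",      // hypothetical filter string
      "rowIDColumn" -> 0,          // renamed from rowIDPosition
      "columnIDPosition" -> 1,     // key keeps its old name in this commit
      "filterColumn" -> -1)        // renamed from filterPosition
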
http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala b/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
index 6908bd2..ad7a76b 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/MahoutOptionParser.scala
@@ -41,9 +41,9 @@ object MahoutOptionParser {
     "filenamePattern" -> "^part-.*")
 
   final val TextDelimitedElementsOptions = immutable.HashMap[String, Any](
-    "rowIDPosition" -> 0,
-    "itemIDPosition" -> 1,
-    "filterPosition" -> -1,
+    "rowIDColumn" -> 0,
+    "itemIDColumn" -> 1,
+    "filterColumn" -> -1,
     "filter1" -> null.asInstanceOf[String],
     "filter2" -> null.asInstanceOf[String],
     "inDelim" -> "[,\t ]")
@@ -135,20 +135,20 @@ class MahoutOptionParser(programName: String) extends OptionParser[Map[String, A
       options + ("filter2" -> x)
     } text ("String (or regex) whose presence indicates a datum for the secondary item set (optional). If not present no secondary dataset is collected")
 
-    opt[Int]("rowIDPosition") abbr ("rc") action { (x, options) =>
-      options + ("rowIDPosition" -> x)
+    opt[Int]("rowIDColumn") abbr ("rc") action { (x, options) =>
+      options + ("rowIDColumn" -> x)
     } text ("Column number (0 based Int) containing the row ID string (optional). Default: 0") validate { x =>
       if (x >= 0) success else failure("Option --rowIDColNum must be >= 0")
     }
 
-    opt[Int]("itemIDPosition") abbr ("ic") action { (x, options) =>
-      options + ("itemIDPosition" -> x)
+    opt[Int]("itemIDColumn") abbr ("ic") action { (x, options) =>
+      options + ("itemIDColumn" -> x)
     } text ("Column number (0 based Int) containing the item ID string (optional). Default: 1") validate { x =>
       if (x >= 0) success else failure("Option --itemIDColNum must be >= 0")
     }
 
-    opt[Int]("filterPosition") abbr ("fc") action { (x, options) =>
-      options + ("filterPosition" -> x)
+    opt[Int]("filterColumn") abbr ("fc") action { (x, options) =>
+      options + ("filterColumn" -> x)
     } text ("Column number (0 based Int) containing the filter string (optional). Default: -1 for no filter") validate { x =>
       if (x >= -1) success else failure("Option --filterColNum must be >= -1")
     }
@@ -156,9 +156,9 @@ class MahoutOptionParser(programName: String) extends OptionParser[Map[String, A
     note("\nUsing all defaults the input is expected of the form: \"userID<tab>itemId\" or \"userID<tab>itemID<tab>any-text...\" and all rows will be used")
 
     checkConfig { options: Map[String, Any] =>
-      if (options("filterPosition").asInstanceOf[Int] == options("itemIDPosition").asInstanceOf[Int]
-        || options("filterPosition").asInstanceOf[Int] == options("rowIDPosition").asInstanceOf[Int]
-        || options("rowIDPosition").asInstanceOf[Int] == options("itemIDPosition").asInstanceOf[Int])
+      if (options("filterColumn").asInstanceOf[Int] == options("itemIDColumn").asInstanceOf[Int]
+        || options("filterColumn").asInstanceOf[Int] == options("rowIDColumn").asInstanceOf[Int]
+        || options("rowIDColumn").asInstanceOf[Int] == options("itemIDColumn").asInstanceOf[Int])
         failure("The row, item, and filter positions must be unique.") else success
     }
 

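The parser folds each flag into an immutable Map[String, Any], and the checkConfig block then rejects any configuration in which two of the three column indices collide. The same rule in isolation, as a runnable sketch (columnsAreUnique is a hypothetical helper name, not part of the commit):

    // Same pairwise comparison as the checkConfig block above.
    def columnsAreUnique(options: Map[String, Any]): Boolean = {
      val filter = options("filterColumn").asInstanceOf[Int]
      val item = options("itemIDColumn").asInstanceOf[Int]
      val row = options("rowIDColumn").asInstanceOf[Int]
      !(filter == item || filter == row || row == item)
    }

    // The defaults (row = 0, item = 1, filter = -1) pass; reusing a
    // column for two roles fails.
    assert(columnsAreUnique(Map("rowIDColumn" -> 0, "itemIDColumn" -> 1, "filterColumn" -> -1)))
    assert(!columnsAreUnique(Map("rowIDColumn" -> 0, "itemIDColumn" -> 0, "filterColumn" -> -1)))
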
http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala b/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala
index 42b2658..92163be 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/Schema.scala
@@ -48,9 +48,9 @@ class Schema(params: Tuple2[String, Any]*) extends HashMap[String, Any] {
 class DefaultElementReadSchema extends Schema(
   "delim" -> "[,\t ]", //comma, tab or space
   "filter" -> "",
-  "rowIDPosition" -> 0,
+  "rowIDColumn" -> 0,
   "columnIDPosition" -> 1,
-  "filterPosition" -> -1)
+  "filterColumn" -> -1)
 
 /** Default Schema for text delimited drm file output
   * This tells the writer to write a DRM of the default form:

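Spelled out, these defaults mean an input datum such as "u1<tab>iphone" parses with the row ID taken from column 0, the column (item) ID from column 1, and no filtering. A small sketch under those defaults, with one hypothetical input line:

    // Defaults from DefaultElementReadSchema above.
    val delim = "[,\t ]"           // comma, tab or space
    val rowIDColumn = 0
    val columnIDPosition = 1
    val filterColumn = -1          // -1: no filter, every row is kept

    val line = "u1\tiphone"        // hypothetical datum
    val tokens = line.split(delim)
    val interaction = tokens(rowIDColumn) -> tokens(columnIDPosition)
    // interaction == ("u1", "iphone")
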
http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala
----------------------------------------------------------------------
diff --git a/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala b/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala
index 53a36a5..274ad98 100644
--- a/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala
+++ b/spark/src/main/scala/org/apache/mahout/drivers/TextDelimitedReaderWriter.scala
@@ -43,9 +43,9 @@ trait TDIndexedDatasetReader extends Reader[IndexedDataset]{
       existingRowIDs: BiMap[String, Int] = HashBiMap.create()): IndexedDataset = {
     try {
       val delimiter = readSchema("delim").asInstanceOf[String]
-      val rowIDPosition = readSchema("rowIDPosition").asInstanceOf[Int]
+      val rowIDColumn = readSchema("rowIDColumn").asInstanceOf[Int]
       val columnIDPosition = readSchema("columnIDPosition").asInstanceOf[Int]
-      val filterPosition = readSchema("filterPosition").asInstanceOf[Int]
+      val filterColumn = readSchema("filterColumn").asInstanceOf[Int]
       val filterBy = readSchema("filter").asInstanceOf[String]
      // instance vars must be put into locally scoped vals when used in closures that are executed by Spark
 
@@ -57,15 +57,15 @@ trait TDIndexedDatasetReader extends Reader[IndexedDataset]{
       var columns = mc.textFile(source).map { line => line.split(delimiter) }
 
       // -1 means no filter in the input text, take them all
-      if(filterPosition != -1) {
+      if(filterColumn != -1) {
         // get the rows that have a column matching the filter
-        columns = columns.filter { tokens => tokens(filterPosition) == filterBy }
+        columns = columns.filter { tokens => tokens(filterColumn) == filterBy }
       }
 
       // get row and column IDs
       //val m = columns.collect
       val interactions = columns.map { tokens =>
-        tokens(rowIDPosition) -> tokens(columnIDPosition)
+        tokens(rowIDColumn) -> tokens(columnIDPosition)
       }
 
       interactions.cache()

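Apart from the renamed vals the read path is unchanged: split each line on the delimiter, optionally keep only rows whose filterColumn token matches the filter string, then pair the row and column IDs. A Spark-free sketch of the same flow, with a Seq of hypothetical lines standing in for the RDD:

    // Mirrors the RDD pipeline above using plain Scala collections.
    val delimiter = ","
    val rowIDColumn = 0
    val filterColumn = 1
    val columnIDPosition = 2
    val filterBy = "purchase"

    val lines = Seq("u1,purchase,iphone", "u1,view,ipad")  // hypothetical
    var columns = lines.map(_.split(delimiter))

    // -1 means no filter in the input text, take them all
    if (filterColumn != -1)
      columns = columns.filter(tokens => tokens(filterColumn) == filterBy)

    val interactions = columns.map(tokens => tokens(rowIDColumn) -> tokens(columnIDPosition))
    // interactions == Seq(("u1", "iphone"))
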
http://git-wip-us.apache.org/repos/asf/mahout/blob/e24c4afb/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
----------------------------------------------------------------------
diff --git a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
index 0a73469..79cd6d9 100644
--- a/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
+++ b/spark/src/test/scala/org/apache/mahout/drivers/ItemSimilarityDriverSuite.scala
@@ -94,9 +94,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
         "--filter1", "purchase",
         "--filter2", "view",
         "--inDelim", ",",
-        "--itemIDPosition", "2",
-        "--rowIDPosition", "0",
-        "--filterPosition", "1"
+        "--itemIDColumn", "2",
+        "--rowIDColumn", "0",
+        "--filterColumn", "1"
     ))
 */
   // local multi-threaded Spark with HDFS using large dataset
@@ -108,9 +108,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", ",",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1"
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1"
     ))
   */
 
@@ -153,9 +153,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", ",",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1",
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1",
       "--writeAllDatasets"))
 
     // todo: these comparisons rely on a sort producing the same lines, which could possibly
@@ -207,9 +207,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", "[,\t]",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1"))
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1"))
 
     // todo: a better test would be to get sorted vectors and compare rows instead of tokens, this might miss
     // some error cases
@@ -259,9 +259,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", "\t",
-      "--itemIDPosition", "4",
-      "--rowIDPosition", "1",
-      "--filterPosition", "2"))
+      "--itemIDColumn", "4",
+      "--rowIDColumn", "1",
+      "--filterColumn", "2"))
 
 
     val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable
@@ -420,9 +420,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", "\t",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1",
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1",
       "--filenamePattern", "m..tsv",
       "--recursive"))
 
@@ -475,9 +475,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", ",",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1"))
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1"))
 
     val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable
     tokenize(indicatorLines) should contain theSameElementsAs SelfSimilairtyTokens
@@ -541,9 +541,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", ",",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1"))
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1"))
 
     val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable
     val crossIndicatorLines = mahoutCtx.textFile(OutPath + "/cross-indicator-matrix/").collect.toIterable
@@ -603,9 +603,9 @@ class ItemSimilarityDriverSuite extends FunSuite with DistributedSparkSuite {
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", ",",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1",
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1",
       "--writeAllDatasets"))
 
     val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable
@@ -697,9 +697,9 @@ removed ==> u3	0	      0	      1	          0
       "--filter1", "purchase",
       "--filter2", "view",
       "--inDelim", ",",
-      "--itemIDPosition", "2",
-      "--rowIDPosition", "0",
-      "--filterPosition", "1",
+      "--itemIDColumn", "2",
+      "--rowIDColumn", "0",
+      "--filterColumn", "1",
       "--writeAllDatasets"))
 
     val indicatorLines = mahoutCtx.textFile(OutPath + "/indicator-matrix/").collect.toIterable
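
One case worth noting: the tab-delimited test above exercises a non-default layout, with the row ID in column 1, the filter in column 2, and the item ID in column 4. A sketch of a line that matches those settings; every field other than columns 1, 2, and 4 is hypothetical padding:

    // Columns:         0        1      2         3       4
    val logLine = "2014-09-04\tu1\tpurchase\trandom\tiphone"  // hypothetical
    val tokens = logLine.split("\t")

    // With --rowIDColumn 1, --filterColumn 2, --itemIDColumn 4:
    assert(tokens(1) == "u1")          // row ID
    assert(tokens(2) == "purchase")    // filter value
    assert(tokens(4) == "iphone")      // item ID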